def main(dataset: str, target: str, symbol: str):
    """Show the stored SHAP summary plot for one symbol and print the per-class SHAP table.

    The feature selection is always looked up with method ``importances_shap``
    and target ``'class'``; ``target`` is only used in the figure title.

    :param dataset: dataset name passed to ``DatasetService.get_dataset``
    :param target: target name shown in the plot title
    :param symbol: symbol whose dataset is loaded
    """
    service = DatasetService()
    ds = service.get_dataset(name=dataset, symbol=symbol)
    selection = DatasetService.get_feature_selection(ds=ds,
                                                     method='importances_shap',
                                                     target='class')
    # (A disabled treemap / sunburst rendering of the feature hierarchy used
    # to sit here.)
    shap_values, _expected_values = parse_shap_values(selection.shap_values)

    interval = selection.search_interval
    X = service.get_dataset_features(ds=ds,
                                     begin=interval.begin,
                                     end=interval.end)
    # The target is still fetched over the same interval although nothing
    # below reads it.
    y = service.get_target(name='class',
                           symbol=symbol,
                           begin=interval.begin,
                           end=interval.end)

    figure = plt.figure()
    plt.suptitle(f"Shap summary plot for {dataset}.{symbol} -> {target}")
    shap.summary_plot(shap_values,
                      X,
                      class_names=["SELL", "HOLD", "BUY"],
                      show=False,
                      max_display=352,
                      use_log_scale=True)
    plt.tight_layout()
    figure.show()

    def _class_frame(cls, values):
        # One frame per class, columns tagged with the class index.
        frame = pd.DataFrame(values, columns=X.columns, index=X.index)
        frame.columns = [f"{c}_class{cls}" for c in frame.columns]
        return frame

    frames = [_class_frame(cls, values) for cls, values in enumerate(shap_values)]
    combined = pd.concat(frames, axis='columns')
    combined = combined.reindex(sorted(combined.columns), axis=1)
    print(combined.head())
def create_model_test(self, *, model: Model, split=0.7, step=None, task_key=None, window=None, **kwargs):
    """Build (without persisting) a ``ModelTest`` for ``model``.

    :param model: model whose dataset, symbol and target are used
    :param split: value forwarded to ``get_train_test_split_indices``
    :param step: test step; falls back to the dataset's ``interval``
    :param task_key: task identifier; a fresh UUID4 string when falsy
    :param window: training window; ``{'days': 30}`` when falsy
    :param kwargs: ``parameters`` (a value, or ``'latest'`` to reuse the
        model's last stored parameters) and ``features`` (a value, or the
        name of a feature selection method to look up on the dataset)
    :return: the new ``ModelTest``
    """
    dataset_service = DatasetService()
    ds = dataset_service.get_dataset(model.dataset, model.symbol)
    splits = DatasetService.get_train_test_split_indices(ds, split)

    parameters = kwargs.get('parameters')
    features = kwargs.get('features')

    # 'latest' means: reuse the most recent parameter search, if any exists.
    if isinstance(parameters, str) and parameters == 'latest':
        parameters = model.parameters[-1].parameters if model.parameters else None

    # A string is treated as the name of a feature selection method.
    if isinstance(features, str):
        selection = DatasetService.get_feature_selection(ds=ds,
                                                         method=features,
                                                         target=model.target)
        features = selection.features if selection else None

    return ModelTest(
        window=window or {'days': 30},
        step=step or ds.interval,
        parameters=parameters or {},
        features=features or [],
        test_interval=splits['test'],
        task_key=task_key or str(uuid4()),
    )
def main():
    """Audit stored parameter searches against the SHAP feature selection.

    For every model matching dataset ``merged_new`` / target ``class``, each
    parameter search is checked feature by feature: an error is logged for
    every feature that is not in the dataset's ``importances_shap``
    selection, and the number of matching features is logged per search.
    Models without a feature selection or without parameters are reported
    and skipped.
    """
    models = ModelService()
    datasets = DatasetService()
    query = {"dataset": "merged_new", "target": "class"}
    all_models = models.query_models(query=query)
    for m in all_models:
        ds = datasets.get_dataset(name=m.dataset, symbol=m.symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=m.target)
        if not fs:
            logging.error(
                f"Dataset {m.dataset}{m.symbol} -> {m.target} does not have feature selection"
            )
            continue
        if not m.parameters:
            logging.error(
                f"Model {m.pipeline}({m.dataset}{m.symbol}) -> {m.target} does not have parameters"
            )
            continue

        # Build the lookup once per model instead of scanning the list for
        # every feature of every parameter search.
        selected = set(fs.features)
        for mp in m.parameters:
            count = 0
            for f in mp.features:
                if f not in selected:
                    logging.error(
                        f"Model {m.pipeline}({m.dataset}{m.symbol}) -> {m.target} parameter search done without fixing features!"
                    )
                else:
                    count += 1
            logging.info(
                f"Model {m.pipeline}({m.dataset}{m.symbol}) -> {m.target} GRIDSEARCH {mp.parameter_search_method} done with {count} features"
            )
def predict_day(self, pipeline: str, dataset: str, target: str, symbol: str, day: str, window: dict):
    """Train on the window ending at ``day`` and return the prediction for that day.

    :param pipeline: pipeline name used to look up the model
    :param dataset: dataset name used to look up the model
    :param target: target name used to look up the model
    :param symbol: symbol used to look up the model
    :param day: timestamp of the day to predict; also the end of the data range
    :param window: interval subtracted from ``day`` to get the range start
    :return: whatever the module-level ``predict_day`` helper returns
    :raises MessageException: if the dataset's valid index starts after the
        window start, or the target holds fewer than two classes
    """
    model = self.get_model(pipeline, dataset, target, symbol)

    # Load dataset
    dataset_service = DatasetService()
    dataset_info = dataset_service.get_dataset(model.dataset, model.symbol)

    # The training range starts one window before the requested day.
    window_start = sub_interval(timestamp=day, interval=window)
    first_valid = from_timestamp(dataset_info.valid_index_min).timestamp()
    if first_valid > from_timestamp(window_start).timestamp():
        raise MessageException("Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]" \
                               .format(model.pipeline, model.dataset, model.symbol, window))

    X = dataset_service.get_features(model.dataset, model.symbol, begin=window_start, end=day)
    y = dataset_service.get_target(model.target, model.symbol, begin=window_start, end=day)

    classes = np.unique(y)
    if len(classes) < 2:
        logging.error(
            "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
            format(model.symbol, model.dataset, model.target, model.pipeline, classes))
        raise MessageException(
            "Training data contains less than 2 classes: {}".format(classes))

    # Load pipeline
    pipeline_module = get_pipeline(model.pipeline)

    # Inside the method body this name resolves to the module-level helper,
    # not to this method.
    return predict_day(pipeline_module.estimator, model.parameters[-1], X, y, day)
def test_model(self, model: Model, mt: ModelTest, **kwargs):
    """Run a windowed test for ``model`` and store the reports on ``mt``.

    The model is created in the repository when it has no id. If a test with
    the same ``task_key`` already exists, ``mt`` is returned untouched.

    :param model: model under test
    :param mt: test definition (interval, window, step, parameters, task key)
    :return: ``mt`` with timings, classification results and report filled in
    :raises MessageException: if the dataset's valid index starts after the
        first training window, or the target holds fewer than two classes
    """
    if not model.id:
        model = self.model_repo.create(model)
    if self.model_repo.exist_test(model.id, mt.task_key):
        logging.info("Model {} test {} already executed!".format(
            model.id, mt.task_key))
        return mt

    # Load dataset
    dataset_service = DatasetService()
    dataset_info = dataset_service.get_dataset(model.dataset, model.symbol)

    # Data range: one window before the test interval up to one step after it.
    range_start = sub_interval(timestamp=mt.test_interval.begin, interval=mt.window)
    range_end = add_interval(timestamp=mt.test_interval.end, interval=mt.step)
    first_valid = from_timestamp(dataset_info.valid_index_min).timestamp()
    if first_valid > from_timestamp(range_start).timestamp():
        raise MessageException("Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]" \
                               .format(model.pipeline, model.dataset, model.symbol, mt.window))

    X = dataset_service.get_features(model.dataset, model.symbol, begin=range_start, end=range_end)
    y = dataset_service.get_target(model.target, model.symbol, begin=range_start, end=range_end)

    classes = np.unique(y)
    if len(classes) < 2:
        logging.error(
            "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
            format(model.symbol, model.dataset, model.target, model.pipeline, classes))
        raise MessageException(
            "Training data contains less than 2 classes: {}".format(classes))

    # Load pipeline
    pipeline_module = get_pipeline(model.pipeline)

    # Slice testing interval in windows
    ranges = timestamp_windows(range_start, range_end, mt.window, mt.step)

    mt.start_at = get_timestamp()
    outcome = test_windows(pipeline_module.estimator, mt.parameters, X, y, ranges)
    mt.end_at = get_timestamp()

    mt.classification_results = outcome.to_dict()

    report = flattened_classification_report_imbalanced(outcome.label, outcome.predicted)
    proba_columns = [c for c in outcome.columns if '_proba_' in c]
    report.update(roc_auc_report(outcome.label, outcome.predicted, outcome[proba_columns]))
    mt.classification_report = report

    self.model_repo.append_test(model.id, mt)
    return mt
def _generalize_feature_name(f: str) -> str:
    """Collapse a parameterised feature name into its family placeholder.

    Names that match no family are returned unchanged. Rules are checked in
    order and the first match wins.
    """
    if f.startswith('adrbal1in') and f.endswith('cnt'):
        return 'adrbal1in{N}cnt'
    if f.startswith('adrbalntv') and f.endswith('cnt'):
        return 'adrbalntv{N}cnt'
    if f.startswith('splyact') and 'pct' not in f:
        return 'splyact{T}'
    if f.startswith('splyadrbal1in'):
        return 'splyadrbal1in{N}'
    if f.startswith('splyadrbalntv'):
        return 'splyadrbalntv{N}'
    if f.startswith('splyadrtop'):
        return 'splyadrtop{N}'
    if f.startswith('adrbalusd') and f.endswith('cnt'):
        return 'adrbalusd{N}cnt'
    if f.startswith('splyadrbalusd'):
        return 'splyadrbalusd{N}'
    if f.startswith('txtfrval') and f.endswith('ntv'):
        return 'txtfrval{A}ntv'
    if f.startswith('txtfrval') and f.endswith('usd'):
        return 'txtfrval{A}usd'
    if f.startswith('fee') and f.endswith('usd'):
        return 'fee{A}usd'
    if f.startswith('gaslmtblk'):
        return 'gaslmtblk'
    if f.startswith('gaslmttx'):
        return 'gaslmttx'
    if f.startswith('gasusedtx'):
        return 'gasusedtx'
    # The original check used the misspelt prefix 'isccont', which never
    # matched the 'isscont' family it maps to. The misspelt prefix is still
    # accepted so any input that relied on it keeps working.
    if f.startswith(('isscont', 'isccont')):
        return 'isscont'
    return f


def main(dataset: str):
    """Print a feature-family x symbol availability table for ``dataset``.

    Each symbol's feature names are collapsed into families, every family
    present for a symbol is marked ``'Y'`` and missing ones ``'N'``. The
    table is transposed so that symbols (with ``USD`` stripped) are columns.

    :param dataset: dataset name whose symbols and features are inspected
    """
    ds_service = DatasetService()
    symbols = ds_service.get_dataset_symbols(name=dataset)
    ds_data = {
        s: ds_service.get_dataset(name=dataset, symbol=s).features
        for s in symbols
    }

    # We need to reshape / flatten data: one record per symbol.
    records = []
    for symbol, features in ds_data.items():
        record = {'symbol': symbol.replace('USD', '')}
        for f in features:
            record[_generalize_feature_name(f)] = 'Y'
        records.append(record)

    result_frame = pd.DataFrame.from_records(records).fillna(value='N')
    result_frame = result_frame.set_index(keys='symbol').T
    # The LaTeX rendering is produced but not printed or returned here.
    latex = result_frame.to_latex()
    print(result_frame.head())
def get_dataset(
        symbol: str,
        dataset: Optional[str] = None,
        target: Optional[str] = None,
        begin: Optional[str] = None,
        end: Optional[str] = None,
        service: DatasetService = Depends(DatasetService),
):
    """Return features and/or a target column for ``symbol`` as CSV text.

    :param symbol: symbol to read
    :param dataset: feature dataset name (optional if ``target`` is given)
    :param target: target column name (optional if ``dataset`` is given)
    :param begin: range start; omitted -> dataset ``index_min``,
        ``'auto'`` -> ``valid_index_min``
    :param end: range end; omitted -> dataset ``index_max``,
        ``'auto'`` -> ``valid_index_max``
    :param service: injected ``DatasetService``
    :return: CSV string whose index column is labelled ``time``
    :raises HTTPException: 400 when neither ``dataset`` nor ``target`` is set
    """
    if not (dataset or target):
        raise HTTPException(
            status_code=400,
            detail=
            "At least one of 'dataset' or 'target' parameters must be specified!"
        )

    # With no feature dataset requested, bounds come from the 'target' dataset.
    meta = service.get_dataset(name=dataset if dataset else 'target', symbol=symbol)

    # If begin/end not specified, use recorded.
    # If auto use valid.
    if not begin:
        begin = meta.index_min
    elif begin == 'auto':
        begin = meta.valid_index_min

    if not end:
        end = meta.index_max
    elif end == 'auto':
        end = meta.valid_index_max

    # Retrieve dataframes
    frames = []
    if dataset:
        frames.append(
            service.get_features(name=dataset, symbol=symbol, begin=begin, end=end))
    if target:
        frames.append(
            service.get_target(name=target, symbol=symbol, begin=begin, end=end))

    # Concatenate dataframes and target
    if len(frames) > 1:
        combined = pd.concat(frames, axis='columns')
    else:
        combined = frames[0]

    # Return CSV
    return combined.to_csv(index_label='time')
def main(dataset: str, target: str, pipeline: str):
    """Export one CSV of test-day SHAP values per symbol and test window.

    For every symbol in ``SYMBOLS`` and every test stored on its model, the
    day estimators are loaded, SHAP values are computed for each estimator's
    own day, and the rows are written to
    ``data/shap_values/<dataset>/<target>/<pipeline>/``.

    :param dataset: dataset name
    :param target: target name
    :param pipeline: pipeline name used to look up the model
    """
    dataset_service = DatasetService()
    model_service = ModelService()
    out_dir = f"data/shap_values/{dataset}/{target}/{pipeline}/"

    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        ds = dataset_service.get_dataset(name=dataset, symbol=symbol)
        selection = DatasetService.get_feature_selection(ds=ds,
                                                         method='importances_shap',
                                                         target=target)
        X_all = dataset_service.get_dataset_features(ds=ds, columns=selection.features)
        y_all = dataset_service.get_dataset_target(ds=ds, name=target)
        model = model_service.get_model(pipeline=pipeline,
                                        dataset=dataset,
                                        target=target,
                                        symbol=symbol)

        for t in model.tests:
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)

            rows = []
            print("Calculating shap values...")
            for est in tqdm(estimators):
                day_label = y_all.loc[est.day]
                shap_v, shap_exp = get_shap_values(estimator=est,
                                                   X=X_all.loc[est.day],
                                                   X_train=est.train_x,
                                                   bytes=False)
                # One row per estimator, indexed by the estimator's day.
                row = pd.DataFrame([shap_v],
                                   index=[pd.to_datetime(est.day)],
                                   columns=X_all.columns)
                row['label'] = day_label
                row['shap_expected'] = shap_exp
                rows.append(row)

            print("Exporting dataframe..")
            combined = pd.concat(rows, axis='index')
            os.makedirs(out_dir, exist_ok=True)
            combined.to_csv(
                f"data/shap_values/{dataset}/{target}/{pipeline}/shap_test_{symbol}_Wdays{t.window['days']}.csv",
                index_label='time')
            print("Exported.")

        print(f"Plotted {symbol}")
def main(dataset: str, target: str, symbol: str):
    """Export the feature hierarchy with importances for one symbol as CSV.

    Two files are written under ``data/selection_<dataset>_<target>/``: the
    full hierarchy, and the rows whose ``name`` is among the selected
    features. The feature selection is always looked up with target
    ``'class'``; ``target`` only appears in file names.

    :param dataset: dataset name
    :param target: target name used in the hierarchy and output paths
    :param symbol: symbol whose feature selection is exported
    """
    service = DatasetService()
    ds = service.get_dataset(name=dataset, symbol=symbol)
    selection = DatasetService.get_feature_selection(ds=ds,
                                                     method='importances_shap',
                                                     target='class')
    # (Per-class mean SHAP augmentation of the hierarchy is disabled.)
    hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml",
                               importances=selection.feature_importances)

    os.makedirs(f"data/selection_{dataset}_{target}/", exist_ok=True)
    frame = pd.DataFrame(hierarchy)

    all_path = f"data/selection_{dataset}_{target}/{symbol}_feature_importances.csv"
    frame.to_csv(all_path, index_label='index')
    print(f"Augmented importances dataframe exported to {all_path}")

    selected_path = f"data/selection_{dataset}_{target}/{symbol}_feature_importances_selected.csv"
    is_selected = frame.name.isin(selection.features)
    frame[is_selected].to_csv(selected_path, index_label='index')
    print(f"Augmented selected features dataframe exported to {selected_path}")
def create_classification_models(self, query, pipeline):
    """Create models for every (dataset, target, pipeline) combination.

    :param query: dataset query; ``None`` selects datasets of type
        ``FEATURES``
    :param pipeline: not used by this method; pipelines come from
        ``PIPELINE_LIST``
    :return: flat list of the items returned by ``create_models_batch`` for
        each symbol
    """
    ds = DatasetService()
    if query is None:
        # A plain dict: the previous doubled braces built a set containing a
        # dict, which raises TypeError (dicts are unhashable).
        query = {
            "type": "FEATURES",
        }
    datasets = ds.query(query)

    # All possible combinations, grouped by symbol
    all_models = {}
    for d in datasets:
        # Get targets for this symbol
        tgt = ds.get_dataset('target', d.symbol)
        combos = all_models.setdefault(d.symbol, [])
        for t, p in itertools.product(tgt.features, PIPELINE_LIST):
            # 'price' and 'pct' are skipped as targets.
            if t in ['price', 'pct']:
                continue
            combos.append((d, t, p))

    # Process one batch of items per symbol in parallel
    results = Parallel(n_jobs=-1)(
        delayed(create_models_batch)(symbol, items)
        for symbol, items in all_models.items())
    return [item for sublist in results for item in sublist]
def main(dataset: str, target: str) -> None:
    """Render one summary figure per pipeline, symbol, test window and class.

    Each figure has three rows: a rolling precision line for the class, the
    per-day SHAP lines of the highest-spread features, and three SHAP
    summary plots built from the daily CSV files of the first, middle and
    last test day. Figures are saved as PNG under ``images/shap-test-final/``.

    All inputs are read from CSV files under
    ``data/shap_values/<dataset>/<target>/<pipeline>/``.

    :param dataset: dataset name
    :param target: target name
    """
    num_shap_plots = 3
    shap_show_count = 10
    ds_service = DatasetService()
    m_service = ModelService()
    for pipeline in PIPELINES:
        for symbol in SYMBOLS:
            print(
                f"Plotting shap dataframes for pipeline {pipeline} symbol {symbol}"
            )
            ds = ds_service.get_dataset(name=dataset, symbol=symbol)
            fs = DatasetService.get_feature_selection(
                ds=ds, method='importances_shap', target=target)
            X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
            # NOTE(review): y_all is not read anywhere below.
            y_all = ds_service.get_dataset_target(ds=ds, name=target)
            model = m_service.get_model(pipeline=pipeline,
                                        dataset=dataset,
                                        target=target,
                                        symbol=symbol)
            for t in model.tests:
                # csv_name keeps a literal "{label}" so it can be filled in
                # per class with str.format below.
                placeholder = "{label}"
                csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
                expected_csv_name = csv_name.format(label='SHAP_expected')
                print(f"Loading results for test {t.window}")
                results = ModelService.parse_test_results(test=t)
                # NOTE(review): exp_shap_df is loaded but not read below.
                exp_shap_df = pd.read_csv(expected_csv_name,
                                          index_col='time',
                                          parse_dates=True)
                for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                    class_csv_name = csv_name.format(label=label)
                    cls_shap_df = pd.read_csv(class_csv_name,
                                              index_col='time',
                                              parse_dates=True)
                    cls_shap_df = cls_shap_df.loc[t.test_interval.begin:t.
                                                  test_interval.end]
                    # NOTE(review): x_train and chunk_size are not read below.
                    x_train = X_all.loc[cls_shap_df.index]
                    chunk_size = int(cls_shap_df.shape[0] / num_shap_plots)
                    fig = plt.figure(constrained_layout=True,
                                     figsize=(100, 50),
                                     dpi=300)
                    # Row 0: precision, row 1: SHAP lines, row 2: one cell
                    # per summary plot.
                    gs = GridSpec(3,
                                  num_shap_plots,
                                  figure=fig,
                                  wspace=1.5,
                                  hspace=0.3)
                    precision_ax = fig.add_subplot(gs[0, :])
                    shap_values_ax = fig.add_subplot(gs[1, :])
                    beeswarms_axs = [
                        fig.add_subplot(gs[2, i]) for i in range(num_shap_plots)
                    ]
                    #format_axes(fig)
                    shap_plot_labels = set()
                    # Strip separators from the result timestamps to get the
                    # DAY<...> suffix used in the daily CSV file names.
                    first_shap_day = results.iloc[0]['time'].replace(
                        '+00:00', '').replace('T', '').replace(':', '').replace('-', '')
                    middle_shap_day = results.iloc[int(
                        results.shape[0] / 2)]['time'].replace(
                            '+00:00', '').replace('T', '').replace(':', '').replace('-', '')
                    last_shap_day = results.iloc[-1]['time'].replace(
                        '+00:00', '').replace('T', '').replace(':', '').replace('-', '')
                    for idx, dayname in enumerate(
                        [first_shap_day, middle_shap_day, last_shap_day]):
                        day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/daily/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_DAY{dayname}.csv"
                        # Plot each section's SHAP values
                        cdf_subset = pd.read_csv(day_csv_name,
                                                 index_col='time',
                                                 parse_dates=True)
                        train_subset = X_all.loc[cdf_subset.index]
                        # Get a rank of feature labels based on this section's shap values
                        abs_mean_shap = cdf_subset.abs().mean(axis='index')
                        abs_mean_rank = abs_mean_shap.sort_values(
                            ascending=False)[:shap_show_count]
                        for l in abs_mean_rank.index:
                            # Save labels for features in the top-N
                            shap_plot_labels.add(l)
                        # Plot this section's SHAP values
                        # summary_plot draws on the current axes, so select
                        # the target cell first.
                        plt.sca(beeswarms_axs[idx])
                        shap.summary_plot(cdf_subset.values,
                                          train_subset,
                                          max_display=shap_show_count,
                                          show=False,
                                          color_bar=False,
                                          sort=True)
                        min_date = cdf_subset.index.min().to_pydatetime()
                        # The displayed window end is one day past the last
                        # row of the daily file.
                        max_date = cdf_subset.index.max().to_pydatetime(
                        ) + timedelta(days=1)
                        min_date_f = min_date.strftime("%Y/%m/%d")
                        max_date_f = max_date.strftime("%Y/%m/%d")
                        beeswarms_axs[idx].set_xlabel(
                            f"SHAP values\nWindow: {min_date_f} - {max_date_f}",
                            fontsize=8)
                        beeswarms_axs[idx].tick_params(axis='y',
                                                       which='major',
                                                       labelsize=6)
                        beeswarms_axs[idx].tick_params(axis='x',
                                                       which='major',
                                                       labelsize=8)
                    # Plot shap values
                    # This re-reads the per-class CSV, unsliced this time.
                    day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_.csv"
                    plot_cls_shap_df = pd.read_csv(day_csv_name,
                                                   index_col='time',
                                                   parse_dates=True)

                    def get_spread(series):
                        # Absolute range (max - min) of one column.
                        return np.abs(series.max() - series.min())

                    # Among features that made a top-N in any of the three
                    # days, keep those with the widest range.
                    plot_rank = plot_cls_shap_df[list(shap_plot_labels)].apply(
                        get_spread, axis='index').sort_values(
                            ascending=False)[:shap_show_count]
                    plot_cls_shap_df['xlabel'] = [
                        t.to_pydatetime().strftime("%Y/%m/%d")
                        for t in plot_cls_shap_df.index
                    ]
                    shap_ax = plot_cls_shap_df.plot(
                        x='xlabel',
                        y=[c for c in plot_rank.index],
                        kind='line',
                        ax=shap_values_ax,
                        legend=False,
                        xlabel='')
                    patches, labels = shap_ax.get_legend_handles_labels()
                    shap_ax.legend(patches,
                                   labels,
                                   loc='center left',
                                   bbox_to_anchor=(1, 0.5),
                                   prop={'size': 6})
                    shap_ax.tick_params(axis='x', which='major', labelsize=8)
                    shap_ax.set_ylabel('mean(|SHAP|)', fontsize=6)
                    #shap_ax.tick_params(labelbottom=False, labelleft=False)
                    # Get Metrics scores dataframe
                    # Smoothed with a 7-row rolling mean.
                    cri_df = get_metrics_df(results).rolling(
                        7, min_periods=1).mean()
                    cri_df['xlabel'] = [
                        t.to_pydatetime().strftime("%Y/%m/%d")
                        for t in cri_df.index
                    ]
                    cri_ax = cri_df.plot(x='xlabel',
                                         y=f"pre_{cls}",
                                         kind='line',
                                         ax=precision_ax,
                                         legend=False,
                                         xlabel='')
                    patches, labels = cri_ax.get_legend_handles_labels()
                    cri_ax.legend(patches,
                                  labels,
                                  loc='center left',
                                  bbox_to_anchor=(1, 0.5),
                                  prop={'size': 6})
                    cri_ax.set_ylabel('mean(precision)', fontsize=6)
                    cri_ax.tick_params(labelbottom=False, labelleft=True)
                    min_date = cri_df.index.min().to_pydatetime().strftime(
                        "%Y/%m/%d")
                    max_date = cri_df.index.max().to_pydatetime().strftime(
                        "%Y/%m/%d")
                    window = t.window['days']
                    fig.suptitle(
                        f"{symbol}, {pipeline}, W={window}D, Class {label}, From {min_date} to {max_date}"
                    )
                    # fig.show()
                    os.makedirs(f"images/shap-test-final/", exist_ok=True)
                    plt.savefig(
                        f"images/shap-test-final/{pipeline}_W{window}D_{dataset}_{target}_{symbol}_{label}.png",
                        dpi='figure')
                    plt.close()
                    print(f"{label} OK")
                print(f"Exported symbol {symbol}.")
            # # Load day estimator
            # est = load_estimator()
            print(f"Plotted {symbol}")
def main(dataset: str, target: str, pipeline: str):
    """Load the per-class SHAP CSV files for every symbol and test window.

    The figure rendering that used to follow the loading step is disabled,
    so this currently only creates the output directory, reads the inputs
    and prints progress.

    :param dataset: dataset name
    :param target: target name
    :param pipeline: pipeline name used to look up the model and file paths
    """
    hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml")
    hierarchy_df = pd.DataFrame(hierarchy)
    num_shap_plots = 3
    shap_show_count = 10
    dataset_service = DatasetService()
    model_service = ModelService()

    for symbol in SYMBOLS:
        print(f"Plotting shap dataframes for symbol {symbol}")
        ds = dataset_service.get_dataset(name=dataset, symbol=symbol)
        selection = DatasetService.get_feature_selection(ds=ds,
                                                         method='importances_shap',
                                                         target=target)
        X_all = dataset_service.get_dataset_features(ds=ds, columns=selection.features)
        y_all = dataset_service.get_dataset_target(ds=ds, name=target)
        model = model_service.get_model(pipeline=pipeline,
                                        dataset=dataset,
                                        target=target,
                                        symbol=symbol)

        for t in model.tests:
            os.makedirs(
                f"images/shap-test-hierarchy/{dataset}/{target}/{pipeline}/",
                exist_ok=True)
            # The literal "{label}" is kept in the name and filled in per class.
            placeholder = "{label}"
            csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
            expected_csv_name = csv_name.format(label='SHAP_expected')
            print(f"Loading results for test {t.window}")
            results = ModelService.parse_test_results(test=t)
            expected_df = pd.read_csv(expected_csv_name,
                                      index_col='time',
                                      parse_dates=True)

            for label in ("SELL", "HOLD", "BUY"):
                class_df = pd.read_csv(csv_name.format(label=label),
                                       index_col='time',
                                       parse_dates=True)
                class_df = class_df.loc[t.test_interval.begin:t.test_interval.end]
                x_train = X_all.loc[class_df.index]
                chunk_size = int(class_df.shape[0] / num_shap_plots)
                # (Figure rendering for this class is disabled.)
                print(f"{label} OK")

            print(f"Exported symbol {symbol}.")

        print(f"Plotted {symbol}")
def main(dataset: str, target: str, pipeline: str):
    """Export training-window SHAP values for every symbol and test window.

    For each day estimator of each test, SHAP values are computed on the
    estimator's own training data. Three kinds of CSV are written under
    ``data/shap_values/<dataset>/<target>/<pipeline>/``:

    * ``daily/..._DAY<day>.csv``: full SHAP frame per class and day
    * ``..._<CLASS>_Wdays<N>_.csv``: per-day mean absolute SHAP per class
    * ``..._SHAP_expected_Wdays<N>_.csv``: expected values per day

    :param dataset: dataset name
    :param target: target name
    :param pipeline: pipeline name used to look up the model and file paths
    """
    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline,
                                    dataset=dataset,
                                    target=target,
                                    symbol=symbol)
        for t in model.tests:
            os.makedirs(
                f"data/shap_values/{dataset}/{target}/{pipeline}/daily",
                exist_ok=True)
            # The literal "{label}" is kept in the names and filled in per
            # class with str.format below.
            placeholder = "{label}"
            csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
            day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/daily/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_"
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)
            # NOTE(review): results, X_test and est_class are computed but
            # not read below; kept as in the original.
            results = ModelService.parse_test_results(test=t)
            shaps = [[], [], []]  # one list of per-day records per class
            X_test = X_all.loc[t.test_interval.begin:t.test_interval.end]
            shap_expected = []
            print("Calculating shap values")
            for est in tqdm(estimators):
                est_class = y_all.loc[est.day]
                training_data = est.train_x.astype(np.float64).fillna(value=0)
                shap_v, shap_exp = get_shap_values(estimator=est.named_steps.c,
                                                   X=training_data,
                                                   X_train=training_data,
                                                   bytes=False)
                # A scalar expected value is stored in the last (buy) slot
                # with zeros for the other two.
                if isinstance(shap_exp, float):
                    shap_expected.append([est.day] + [0, 0, shap_exp])
                else:
                    shap_expected.append([est.day] + [v for v in shap_exp])
                for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                    df = pd.DataFrame(shap_v[cls],
                                      index=est.train_x.index,
                                      columns=est.train_x.columns)
                    # Save shap values dataframe for each day
                    dayname = est.day.replace('+00:00', '').replace(
                        'T', '').replace(':', '').replace('-', '')
                    day_class_csv_name = day_csv_name.format(
                        label=label) + f"DAY{dayname}.csv"
                    df.to_csv(day_class_csv_name, index_label='time')
                    # Process data for next plot
                    df_abs_mean = df.abs().mean().to_dict()
                    df_abs_mean['time'] = est.day
                    shaps[cls].append(df_abs_mean)

            # Merge shap values in an unique dataframe and save to csv for each class
            for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                class_csv_name = csv_name.format(label=label)
                print(
                    f"Exporting dataframe for class {label} -> {class_csv_name}"
                )
                cdf = pd.DataFrame.from_records(shaps[cls])
                cdf.index = pd.to_datetime(cdf.time)
                cdf = cdf[cdf.columns.difference(['time'])]
                cdf.to_csv(class_csv_name, index_label='time')

            expected_csv_name = csv_name.format(label='SHAP_expected')
            print(
                f"Exporting expected values dataframe -> {expected_csv_name}")
            edf = pd.DataFrame(
                shap_expected,
                columns=[
                    "time", "shap_expected_sell", "shap_expected_hold",
                    "shap_expected_buy"
                ],
            )
            # Use the day as the index. Writing the default RangeIndex under
            # the label 'time' next to the 'time' column produced two 'time'
            # headers, so read_csv(index_col='time') picked the row counter.
            edf.set_index('time').to_csv(expected_csv_name, index_label='time')
            print(f"Exported symbol {symbol}.")
        print(f"Plotted {symbol}")
def main(pipeline: str, dataset: str, symbol: str, window: int):
    """Plot yearly candlestick charts with the model's predicted signals.

    One figure per calendar year in the test interval: candles and volume
    from the ``ohlcv`` dataset, plus sell / hold / buy markers drawn 10%
    below each day's low. Returns early (after printing a message) when the
    asset or the test cannot be found.

    :param pipeline: pipeline name
    :param dataset: dataset name
    :param symbol: symbol to plot
    :param window: training window used to look up the asset and test
    """
    ds = DatasetService()
    ms = ModelService()
    ts = TradingService()
    ohlcv_ds = ds.get_dataset('ohlcv', symbol=symbol)
    asset = ts.get_asset(pipeline=pipeline,
                         dataset=dataset,
                         target='class',
                         symbol=symbol,
                         window=window,
                         create=False)
    if not asset:
        print(
            f"Asset {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
        )
        return
    test = ms.get_test(pipeline=pipeline,
                       dataset=dataset,
                       target='class',
                       symbol=symbol,
                       window=window)
    if not test:
        print(
            f"Test {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
        )
        # Without a test there is no interval to plot; previously execution
        # continued and failed on `test.test_interval`.
        return

    ohlcv = ds.get_dataset_features(ohlcv_ds,
                                    begin=test.test_interval.begin,
                                    end=test.test_interval.end)
    test_results = ModelService.parse_test_results(test).iloc[:-1]
    enc_pred = onehot_target(test_results.predicted,
                             labels=["is_sell", "is_hold", "is_buy"],
                             fill=False)

    # Mask predictions with low value minus a certain amount (10% of low),
    # so markers are drawn just under the candles.
    signals_level_diff = ohlcv.low * 10 / 100
    signals_level = ohlcv.low - signals_level_diff
    # Assign the masked column back explicitly instead of mutating a column
    # accessor in place, which is not guaranteed to update the frame.
    for column in ("is_sell", "is_hold", "is_buy"):
        enc_pred[column] = enc_pred[column].mask(enc_pred[column] > 0,
                                                 other=signals_level)

    # Get unique years in index to split plots in smaller scale
    unique_years = ohlcv.index.year.unique()
    for year in unique_years:
        year_pred = enc_pred[enc_pred.index.year == year]
        year_ohlcv = ohlcv[ohlcv.index.year == year]

        # Setup matplotfinance styles and figure
        s = mpf.make_mpf_style(base_mpf_style='binance')
        # pass in the self defined style to the whole canvas
        fig = mpf.figure(figsize=(16, 8), style=s)
        fig.suptitle(f"{ohlcv_ds.symbol}, {year}, 1D")

        ax = fig.add_subplot(3, 1, (1, 2))  # main candle stick chart subplot
        av = fig.add_subplot(3, 1, 3, sharex=ax)  # volume candles subplot

        # Setup grid lines along the x axis
        ax.grid(axis='x', color='0.5', linestyle='--')
        av.grid(axis='x', color='0.5', linestyle='--')

        apds = [
            # Nearly transparent close line, then one marker series per class
            mpf.make_addplot(year_ohlcv.close,
                             ax=ax,
                             type='line',
                             color=(0.5, 0.5, 0.5, 0.05)),
            mpf.make_addplot(year_pred.is_sell,
                             ax=ax,
                             type='scatter',
                             marker='v',
                             color='red'),
            mpf.make_addplot(year_pred.is_hold,
                             ax=ax,
                             type='scatter',
                             marker='_',
                             color='silver'),
            mpf.make_addplot(year_pred.is_buy,
                             ax=ax,
                             type='scatter',
                             marker='^',
                             color='lime'),
        ]

        mpf.plot(year_ohlcv,
                 type='candle',
                 style=s,
                 ax=ax,
                 volume=av,
                 show_nontrading=True,
                 addplot=apds,
                 returnfig=True)
        fig.autofmt_xdate()
        fig.tight_layout()
        plt.show()
    print("Done")
class FeatureSelectionService:
    """Runs feature selection over dataset features and stores the outcome."""

    def __init__(self):
        self.model_repo = ModelRepository()
        self.dataset_service = DatasetService()

    def create_features_search(self,
                               *,
                               symbol: str,
                               dataset: str,
                               target: str,
                               split: float,
                               method: str,
                               task_key: str = None) -> ModelFeatures:
        """Build (without running) a feature search over the train split.

        :param symbol: dataset symbol
        :param dataset: dataset name
        :param target: target column name
        :param split: value forwarded to ``get_train_test_split_indices``
        :param method: feature selection method name
        :param task_key: task identifier; a fresh UUID4 string when falsy
        :return: the new ``ModelFeatures`` request
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        result = ModelFeatures(dataset=dataset,
                               target=target,
                               symbol=symbol,
                               search_interval=splits['train'],
                               feature_selection_method=method,
                               task_key=task_key or str(uuid4()))
        return result

    def feature_selection(self, mf: ModelFeatures, **kwargs) -> ModelFeatures:
        """Run the search described by ``mf`` and record the selected features.

        :param mf: search request; updated in place with timings, selected
            features and (for the ``importances*`` methods) importances
        :param kwargs: ``save`` (default ``True``) appends the result to the
            model repository
        :return: the updated ``mf``
        :raises MessageException: if the target holds fewer than two classes
        :raises NotFoundException: if the method name is not recognised
        """
        # Load dataset
        X = self.dataset_service.get_features(mf.dataset,
                                              mf.symbol,
                                              mf.search_interval.begin,
                                              mf.search_interval.end,
                                              columns=mf.features)
        y = self.dataset_service.get_target(mf.target, mf.symbol,
                                            mf.search_interval.begin,
                                            mf.search_interval.end)

        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".
                format(mf.symbol, mf.dataset, mf.target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

        # Perform search
        mf.start_at = get_timestamp()  # Log starting timestamp
        if not mf.feature_selection_method or mf.feature_selection_method == 'importances':
            selector = select_from_model(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
        elif mf.feature_selection_method == 'importances_cv':
            selector = select_from_model_cv(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_.best_estimator_, X.columns)
        elif mf.feature_selection_method == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif mf.feature_selection_method == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif mf.feature_selection_method == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(
                    mf.feature_selection_method))
        mf.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        mf.features = label_support(selector.get_support(), X.columns)

        # Update model with the new results
        if kwargs.get('save', True):
            self.model_repo.append_features_query(
                {
                    "dataset": mf.dataset,
                    "symbol": mf.symbol,
                    "target": mf.target
                }, mf)
        return mf

    def get_available_symbols(self, dataset: str):
        """Return the symbols the dataset service reports for ``dataset``."""
        return self.dataset_service.get_dataset_symbols(name=dataset)

    def feature_selection_new(self, *, symbol: str, dataset: str, target: str,
                              split: float, method: str,
                              **kwargs) -> ModelFeatures:
        """Run feature selection on the train split and attach it to the dataset.

        :param symbol: dataset symbol
        :param dataset: dataset name
        :param target: target column name
        :param split: value forwarded to ``get_train_test_split_indices``
        :param method: ``importances`` (optionally with ``_cv`` and/or
            ``_shap`` in the name), ``fscore``, ``relieff`` or ``multisurf``;
            an empty method is treated like plain ``importances``
        :param kwargs: ``replace`` removes an existing selection first;
            ``save`` persists the result; ``task_key`` overrides the
            generated task identifier
        :return: the ``FeatureSelection`` when ``save`` is falsy, otherwise
            the result of ``append_feature_selection``
        :raises MessageException: if a selection already exists and ``save``
            is set without ``replace``, or the target holds fewer than two
            classes
        :raises NotFoundException: if the method name is not recognised
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        fs_exists = DatasetService.has_feature_selection(ds=ds,
                                                         method=method,
                                                         target=target)
        if fs_exists:
            if kwargs.get('replace'):
                self.dataset_service.remove_feature_selection(ds=ds,
                                                              method=method,
                                                              target=target)
            elif kwargs.get('save'):
                raise MessageException(
                    f"Feature selection with method '{method}' alrady performed for '{dataset}.{symbol}' and target '{target}'"
                )

        splits = DatasetService.get_train_test_split_indices(ds, split)
        fs = FeatureSelection(target=target,
                              method=method,
                              search_interval=splits['train'],
                              # Same fallback rule as create_features_search:
                              # any falsy key gets a fresh UUID.
                              task_key=kwargs.get('task_key') or str(uuid4()))

        # Load dataset
        X = self.dataset_service.get_dataset_features(
            ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
        y = self.dataset_service.get_dataset_target(
            name=fs.target,
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)

        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

        # Perform search
        fs.start_at = get_timestamp()  # Log starting timestamp
        # Substring checks need a string; a missing method previously raised
        # TypeError on `'_cv' in None`.
        method_name = fs.method or ''
        if not method_name or 'importances' in method_name:
            if '_cv' in method_name:
                selector = select_from_model_cv(X, y)
            else:
                selector = select_from_model(X, y)
            fs.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
            if '_shap' in method_name:
                fs.shap_values = get_shap_values(
                    model=selector.estimator_.named_steps.c, X=X, X_train=X)
        elif method_name == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif method_name == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif method_name == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(fs.method))
        fs.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        fs.features = label_support(selector.get_support(), X.columns)

        if not kwargs.get('save'):
            return fs
        return self.dataset_service.append_feature_selection(ds, fs)
def _save_shap_summary_plot(shap_values, features, *, title, path,
                            max_display):
    """Draw one SHAP summary plot, title it, write it to *path* and close it."""
    shap.summary_plot(shap_values,
                      features,
                      max_display=max_display,
                      show=False)
    # Layout is tightened before the title is set, as in the original script.
    plt.tight_layout()
    plt.title(title)
    plt.savefig(path)
    plt.close()


def main(dataset: str, target: str):
    """Save per-class SHAP summary plots for every symbol in ``SYMBOLS``.

    For each symbol the stored ``importances_shap`` feature selection is
    loaded, its SHAP values are restricted to the selected features, and for
    each of the three classes (SELL, HOLD, BUY) four PNG files are written to
    ``images/shap-global``: signed and absolute SHAP values, each for the
    top 50 and the top 25 features.

    :param dataset: name of the dataset whose feature selection is plotted
    :param target: target the feature selection was computed for
    """
    import os

    output_dir = "images/shap-global"
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # Index in the parsed SHAP values -> class label used in titles/file names.
    class_names = ("SELL", "HOLD", "BUY")
    ds_service = DatasetService()

    for symbol in SYMBOLS:
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        if not fs:
            # Nothing to plot for this symbol; keep going with the others.
            print(f"No 'importances_shap' feature selection for {symbol}, "
                  f"skipping")
            continue

        # parse_shap_values returns (values, expected values); only the
        # values are plotted here.
        shap_v, _ = parse_shap_values(fs.shap_values)
        X_train = ds_service.get_dataset_features(
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)

        sel_train = X_train[fs.features]
        # One array per class, restricted to the selected feature columns.
        sel_shap = [
            pd.DataFrame(shap_v[cls],
                         index=X_train.index,
                         columns=X_train.columns)[fs.features].values
            for cls in range(len(class_names))
        ]

        for show_count in (50, 25):
            for absolute in (False, True):
                title_prefix = "Absolute SHAP" if absolute else "SHAP"
                file_infix = "_abs" if absolute else ""
                for cls_name, values in zip(class_names, sel_shap):
                    _save_shap_summary_plot(
                        np.abs(values) if absolute else values,
                        sel_train,
                        title=(f"{title_prefix} Summary plot for {symbol}, "
                               f"top {show_count} features for class "
                               f"{cls_name}"),
                        path=(f"{output_dir}/{dataset}_{target}"
                              f"__shap__summary_plot_{symbol}_{cls_name}"
                              f"{file_infix}_top{show_count}.png"),
                        max_display=show_count)
        print(f"Plotted {symbol}")
class ModelService:
    """Model lookup, housekeeping and testing on top of the model repository."""

    def __init__(self):
        self.model_repo: ModelRepository = ModelRepository()
        self.dataset_service = DatasetService()

    @staticmethod
    def _resolve_n_jobs(kwargs):
        """Return ``kwargs['n_jobs']`` as int, or half of the CPUs (minimum 1).

        The previous ``int(cpu_count() / 2)`` evaluated to 0 on a single-CPU
        host, which joblib rejects.
        """
        if 'n_jobs' in kwargs:
            return int(kwargs['n_jobs'])
        return max(1, cpu_count() // 2)

    def create_classification_models(self, query, pipeline):
        """Create models for every dataset/target/pipeline combination.

        :param query: dataset query; ``None`` selects datasets of type
            ``FEATURES``
        :param pipeline: accepted for interface compatibility; the method
            combines targets with every entry of ``PIPELINE_LIST``
        :return: flat list with the items produced by every per-symbol batch
        """
        ds = DatasetService()
        if query is None:
            # Must be a plain dict: the former ``{{...}}`` literal built a set
            # containing a dict, which raises TypeError (dict is unhashable).
            query = {"type": "FEATURES"}
        datasets = ds.query(query)
        # All possible combinations, grouped by symbol
        all_models = {}
        for d in datasets:
            # Get targets for this symbol
            tgt = ds.get_dataset('target', d.symbol)
            batch = all_models.setdefault(d.symbol, [])
            batch.extend(
                (d, t, p)
                for t, p in itertools.product(tgt.features, PIPELINE_LIST)
                if t not in ('price', 'pct'))
        # Process one batch per symbol in parallel
        results = Parallel(n_jobs=-1)(
            delayed(create_models_batch)(symbol, items)
            for symbol, items in all_models.items())
        return [item for sublist in results for item in sublist]

    def clear_features(self, query=None):
        """Clear stored features of the models matching *query* (all if None)."""
        return self.model_repo.clear_features(query or {})

    def clear_parameters(self, query=None):
        """Clear stored parameters of the models matching *query* (all if None)."""
        return self.model_repo.clear_parameters(query or {})

    def clear_tests(self, query=None):
        """Clear stored tests of the models matching *query* (all if None)."""
        return self.model_repo.clear_tests(query or {})

    def all(self):
        """Return every model yielded by the repository as a list."""
        return list(self.model_repo.iterable())

    @staticmethod
    def get_model_parameters(m: "Model", method: str):
        """Return the first parameters entry of *m* found by *method*, or None."""
        for mp in m.parameters:
            if mp.parameter_search_method == method:
                return mp
        return None

    def remove_parameters(self, model: "Model", method: str):
        """Delete a parameters entry found by *method* and persist the model.

        When several entries match, the last one is removed.

        :return: True if an entry was removed, False otherwise
        """
        found = None
        for i, mp in enumerate(model.parameters):
            if mp.parameter_search_method == method:
                found = i  # keep scanning so the last match wins
        if found is None:
            return False
        del model.parameters[found]
        self.model_repo.update(model.id, model)
        return True

    def get_model_by_id(self, model_id):
        """Return the model stored under *model_id*.

        This lookup used to be a first ``get_model(model_id)`` definition that
        was silently replaced by the four-argument ``get_model`` below and was
        therefore unreachable.
        """
        return self.model_repo.get(model_id)

    def get_model(self, pipeline: str, dataset: str, target: str, symbol: str):
        """Return the first model matching the four keys, or None."""
        result = self.model_repo.query({
            "symbol": symbol,
            "dataset": dataset,
            "target": target,
            "pipeline": pipeline
        })
        if not result:
            return None
        return result[0]

    def get_test(self, pipeline: str, dataset: str, target: str, symbol: str,
                 window: int):
        """Return the model's test whose window spans *window* days, or None.

        None is also returned when the model itself does not exist.
        """
        model = self.get_model(pipeline=pipeline,
                               dataset=dataset,
                               target=target,
                               symbol=symbol)
        if model is None:
            return None
        for t in model.tests:
            if t.window['days'] == window:
                return t
        return None

    @staticmethod
    def parse_test_results(test: "ModelTest"):
        """Convert a test's classification results into a time-indexed DataFrame.

        :param test: a ModelTest, or a dict of its fields
        """
        if isinstance(test, dict):
            test = ModelTest(**test)
        # Re-convert classification results from test to a DataFrame
        results = pd.DataFrame(test.classification_results)
        # Parse index so it's a DateTimeIndex, because Mongo stores it as a string
        results.index = pd.to_datetime(results.time)
        return results

    def get_test_results(self, pipeline: str, dataset: str, target: str,
                         symbol: str, window: int):
        """Look up a test with ``get_test`` and return its parsed results."""
        test = self.get_test(pipeline, dataset, target, symbol, window)
        return ModelService.parse_test_results(test)

    def query_models(self, query, projection: Optional[dict] = None):
        """Run *query* (with optional *projection*) against the model repository."""
        return self.model_repo.query(query, projection)

    def create_model_test(self,
                          *,
                          model: "Model",
                          split=0.7,
                          step=None,
                          task_key=None,
                          window=None,
                          **kwargs):
        """Build (without running or saving) a ModelTest for *model*.

        ``parameters='latest'`` resolves to the model's most recent parameters;
        a string ``features`` is resolved as a feature selection method of the
        dataset. Anything unresolved falls back to ``{}`` / ``[]``.
        """
        service = DatasetService()
        ds = service.get_dataset(model.dataset, model.symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        parameters = kwargs.get('parameters')
        features = kwargs.get('features')
        if isinstance(parameters, str) and parameters == 'latest':
            if model.parameters:
                parameters = model.parameters[-1].parameters
            else:
                parameters = None
        if isinstance(features, str):
            fs = DatasetService.get_feature_selection(ds=ds,
                                                      method=features,
                                                      target=model.target)
            features = fs.features if fs else None
        return ModelTest(window=window or {'days': 30},
                         step=step or ds.interval,
                         parameters=parameters or {},
                         features=features or [],
                         test_interval=splits['test'],
                         task_key=task_key or str(uuid4()))

    def test_model(self, model: "Model", mt: "ModelTest", **kwargs):
        """Run the test *mt* on *model* and append the outcome to the model.

        The model is created in the repository first if it has no id. A test
        whose task key is already stored is returned untouched.

        :raises MessageException: if the dataset starts after the first
            training window, or the training data holds fewer than two classes
        """
        if not model.id:
            model = self.model_repo.create(model)
        if self.model_repo.exist_test(model.id, mt.task_key):
            logging.info("Model {} test {} already executed!".format(
                model.id, mt.task_key))
            return mt
        # Load dataset
        ds = DatasetService()
        d = ds.get_dataset(model.dataset, model.symbol)
        # Get training data including the first training window
        begin = sub_interval(timestamp=mt.test_interval.begin,
                             interval=mt.window)
        end = add_interval(timestamp=mt.test_interval.end, interval=mt.step)
        if from_timestamp(d.valid_index_min).timestamp() > from_timestamp(
                begin).timestamp():
            raise MessageException("Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]" \
                                   .format(model.pipeline, model.dataset, model.symbol, mt.window))
        X = ds.get_features(model.dataset, model.symbol, begin=begin, end=end)
        y = ds.get_target(model.target, model.symbol, begin=begin, end=end)

        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(model.symbol, model.dataset, model.target,
                       model.pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

        # Load pipeline
        pipeline_module = get_pipeline(model.pipeline)
        # Slice testing interval in windows
        ranges = timestamp_windows(begin, end, mt.window, mt.step)

        mt.start_at = get_timestamp()
        df = test_windows(pipeline_module.estimator, mt.parameters, X, y,
                          ranges)
        mt.end_at = get_timestamp()

        mt.classification_results = df.to_dict()

        clf_report = flattened_classification_report_imbalanced(
            df.label, df.predicted)
        roc_report = roc_auc_report(
            df.label, df.predicted,
            df[[c for c in df.columns if '_proba_' in c]])
        clf_report.update(roc_report)
        mt.classification_report = clf_report

        self.model_repo.append_test(model.id, mt)
        return mt

    def test_model_new(self,
                       *,
                       pipeline: str,
                       dataset: str,
                       symbol: str,
                       target: str,
                       split=0.7,
                       step=None,
                       task_key=None,
                       window=None,
                       **kwargs):
        """Fit one estimator per sliding window, predict, and build a ModelTest.

        Recognised ``kwargs``: ``parameters`` (parameter search method to look
        up in the model), ``features``, ``n_jobs`` and ``save``. With a truthy
        ``save`` the result of appending the test to the model is returned,
        otherwise the unsaved ModelTest.

        :raises MessageException: if the model does not exist, the dataset
            starts after the first training window, or no prediction was
            produced
        """
        test_window = window or {'days': 90}
        model = self.get_model(pipeline=pipeline,
                               dataset=dataset,
                               symbol=symbol,
                               target=target)
        if model is None:
            raise MessageException(
                f"Model {pipeline}({dataset}.{symbol}) -> {target} does not exist!"
            )

        ds = self.dataset_service.get_dataset(dataset, symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        test_interval = splits['test']
        test_step = step or ds.interval

        # Parse model parameters: if it's a string, give it an interpretation
        parameters = kwargs.get('parameters')
        features = kwargs.get('features')
        mp = ModelService.get_model_parameters(m=model, method=parameters)
        if not mp:
            logging.warning(
                f"Parameter search with method {parameters} does not exist in model"
                f" {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
            )

        # Get training data including the first training window
        begin = sub_interval(timestamp=test_interval["begin"],
                             interval=test_window)
        end = add_interval(timestamp=test_interval["end"], interval=test_step)
        if from_timestamp(ds.valid_index_min).timestamp() > from_timestamp(
                begin).timestamp():
            raise MessageException(
                f"Not enough data for training with window {test_window}!"
                f" {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
            )
        test_X, test_y = self.dataset_service.get_x_y(dataset, symbol, target,
                                                      features, begin, end)
        # Slice testing interval in "sliding" windows
        windows = list(timestamp_windows(begin, end, test_window, test_step))

        # Fit the models and make predictions
        storage_service.create_bucket(bucket='fit-estimators')

        _n_jobs = ModelService._resolve_n_jobs(kwargs)
        logging.info(
            f"Fitting {len(windows)} estimators with {_n_jobs} threads..")
        fit_estimators = Parallel(n_jobs=_n_jobs)(
            delayed(fit_estimator_new)(model=model,
                                       mp=mp,
                                       features=features,
                                       day=e,
                                       window=test_window,
                                       X=test_X,
                                       y=test_y,
                                       b=b,
                                       e=e,
                                       force=not kwargs.get('save'))
            for b, e in tqdm(windows))

        logging.info(
            f"Saving {len(windows)} fit estimators with {_n_jobs} threads..")
        # Run for the side effect only; the returned names are not used.
        Parallel(n_jobs=_n_jobs)(
            delayed(save_estimator)(estimator=est, )
            for est in tqdm(fit_estimators))

        logging.info(
            f"Predicing {len(windows)} estimators with {_n_jobs} threads..")
        prediction_results = Parallel(n_jobs=_n_jobs)(
            delayed(predict_estimator_day)(estimator=est,
                                           day=est.day,
                                           X=test_X[est.begin:est.end],
                                           y=test_y[est.begin:est.end])
            for est in tqdm(fit_estimators))

        results = [r for r in prediction_results if r is not None]
        df = pd.DataFrame(results)
        if df.empty:
            raise MessageException("TestWindows: Empty result dataframe!")

        classification_records = df.to_dict(orient='records')
        mt = ModelTest(
            window=test_window,
            step=test_step,
            # mp may be missing (only a warning is logged above); fall back to
            # empty parameters instead of failing after all the fitting work.
            parameters=mp.parameters if mp else {},
            features=list(test_X.columns),
            test_interval=splits['test'],
            task_key=task_key or str(uuid4()),
            classification_results=classification_records,
        )
        # Populate classification report fields
        clf_report = flattened_classification_report_imbalanced(
            df.label, df.predicted)
        roc_report = roc_auc_report(
            df.label, df.predicted,
            df[[c for c in df.columns if '_proba_' in c]])
        clf_report.update(roc_report)
        mt.classification_report = clf_report

        # Save test into the model
        if kwargs.get('save'):
            return self.model_repo.append_test(model.id, mt)
        return mt

    @staticmethod
    def load_test_estimators(model: "Model", mt: "ModelTest", **kwargs):
        """Load the estimator of every day listed in the results of *mt*.

        ``kwargs`` may carry ``n_jobs``, ``parameters`` (default
        ``'gridsearch'``) and ``features`` (default ``'importances_shap'``).
        """
        results = ModelService.parse_test_results(mt)
        test_days = list(results.time)
        _n_jobs = ModelService._resolve_n_jobs(kwargs)
        logging.info(f"Loading {len(test_days)} estimators..")
        estimators = Parallel(n_jobs=_n_jobs)(
            delayed(load_estimator)(
                model=model,
                parameters=kwargs.get('parameters', 'gridsearch'),
                features=kwargs.get('features', 'importances_shap'),
                day=day,
                window=mt.window) for day in tqdm(test_days))
        return estimators

    def compare_models(self,
                       symbol: str,
                       dataset: str,
                       target: str,
                       pipeline: Optional[str] = None):
        """Return the stored tests for the given keys; *pipeline* is optional."""
        criteria = {"symbol": symbol, "dataset": dataset, "target": target}
        if pipeline:
            criteria["pipeline"] = pipeline
        return self.model_repo.find_tests(**criteria)

    def predict_day(self, pipeline: str, dataset: str, target: str,
                    symbol: str, day: str, window: dict):
        """Train on the *window* preceding *day* and predict that day.

        :raises MessageException: if the dataset starts after the training
            window, or the training data holds fewer than two classes
        """
        model = self.get_model(pipeline, dataset, target, symbol)
        # Load dataset
        ds = DatasetService()
        d = ds.get_dataset(model.dataset, model.symbol)
        # Get training data including the first training window
        begin = sub_interval(timestamp=day, interval=window)
        if from_timestamp(d.valid_index_min).timestamp() > from_timestamp(
                begin).timestamp():
            raise MessageException("Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]" \
                                   .format(model.pipeline, model.dataset, model.symbol, window))
        X = ds.get_features(model.dataset, model.symbol, begin=begin, end=day)
        y = ds.get_target(model.target, model.symbol, begin=begin, end=day)

        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(model.symbol, model.dataset, model.target,
                       model.pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

        # Load pipeline
        pipeline_module = get_pipeline(model.pipeline)
        # NOTE(review): this passes the whole parameters entry, while
        # create_model_test uses ``model.parameters[-1].parameters`` - confirm
        # which one the module-level predict_day helper expects.
        df = predict_day(pipeline_module.estimator, model.parameters[-1], X,
                         y, day)
        return df
class GridSearchService:
    """Hyper-parameter search (grid, halving grid, random) for stored models."""

    def __init__(self):
        self.model_repo = ModelRepository()
        self.model_service = ModelService()
        self.dataset_service = DatasetService()

    @staticmethod
    def _choose_cv_splits(sample_count, permissive=False):
        """Pick K for K-fold cross validation from the dataset's sample count.

        Train-test split for each fold is 80% train, the lowest training
        window for accurate results is 30 samples, so 30/0.8 = 37.5 ~ 40
        samples per fold are required.

        :raises ValueError: if even 3 folds hold fewer than 40 samples and
            *permissive* is falsy
        """
        min_samples_per_fold = 40
        k = 5
        # If samples per fold with 5-fold CV are too low, use 3-folds
        if sample_count / k < min_samples_per_fold:
            k = 3
        # If samples are still too low, raise a value error
        if sample_count / k < min_samples_per_fold and not permissive:
            raise ValueError("Not enough samples to perform cross validation!")
        return k

    def create_parameters_search(self, model: "Model", split: float,
                                 **kwargs) -> "ModelParameters":
        """Build (without running) a ModelParameters request for *model*.

        ``kwargs['features']`` may be a list of feature names, used as is, or
        the name of a feature selection method whose stored features are
        looked up for ``kwargs['target']`` (default ``'class'``).

        :raises MessageException: if the named feature selection is missing
        :raises ValueError: if the dataset is too small for cross validation
            and ``permissive`` is not set
        """
        ds = self.dataset_service.get_dataset(model.dataset, model.symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)

        # Features can either be a list of features to use, or a string
        # naming the feature selection method to resolve them from.
        features = kwargs.get('features')
        if features and isinstance(features, str):
            target = kwargs.get('target', 'class')
            mf = DatasetService.get_feature_selection(ds=ds,
                                                      method=features,
                                                      target=target)
            if not mf:
                raise MessageException(
                    f"Feature selection not found for {model.dataset}.{model.symbol} -> {target}!"
                )
            features = mf.features

        k = GridSearchService._choose_cv_splits(ds.count,
                                                kwargs.get("permissive"))
        return ModelParameters(cv_interval=splits['train'],
                               cv_splits=k,
                               task_key=kwargs.get('task_key', str(uuid4())),
                               features=features or None)

    def _prepare_search(self, model, mp):
        """Make sure *model* is stored and load everything a search needs.

        :return: ``(model, pipeline_module, X, y)``, or None when a search
            with the same task key is already stored for the model
        """
        if not model.id:  # Make sure the task exists
            # Keep the stored instance: its id is needed to append results.
            model = self.model_repo.create(model)
        if self.model_repo.exist_parameters(model.id, mp.task_key):
            logging.info("Model {} Grid search {} already executed!".format(
                model.id, mp.task_key))
            return None
        pipeline_module, X, y = self._get_dataset_and_pipeline(model, mp)
        return model, pipeline_module, X, y

    def _get_dataset_and_pipeline(self, model: "Model", mp: "ModelParameters",
                                  **kwargs):
        """Load the CV-interval features/target and the model's pipeline.

        :return: ``(pipeline_module, X, y)``
        :raises MessageException: if the target holds fewer than two classes
        """
        # Load dataset
        X = self.dataset_service.get_features(model.dataset,
                                              model.symbol,
                                              mp.cv_interval.begin,
                                              mp.cv_interval.end,
                                              columns=mp.features)
        y = self.dataset_service.get_target(model.target, model.symbol,
                                            mp.cv_interval.begin,
                                            mp.cv_interval.end)

        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(model.symbol, model.dataset, model.target,
                       model.pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))
        logging.info("Dataset loaded: X {} y {} (unique: {})".format(
            X.shape, y.shape, unique))
        # Load pipeline
        pipeline_module = get_pipeline(model.pipeline)
        return pipeline_module, X, y

    @staticmethod
    def _fit_search(search, X, y, mp, tag):
        """Fit *search*, stamping start/end on *mp*; return False if it failed."""
        try:
            mp.start_at = get_timestamp()  # Log starting timestamp
            search.fit(X, y)
            mp.end_at = get_timestamp()  # Log ending timestamp
        except SplitException as e:
            logging.exception(
                "Model {} splitting yields single-class folds!\n{}".format(
                    tag, e.message))
            return False
        except ValueError as e:
            logging.exception("Model {} raised ValueError!\n{}".format(tag, e))
            return False
        return True

    def grid_search(self, model: "Model", mp: "ModelParameters",
                    **kwargs) -> "ModelParameters":
        """Run a (halving, if ``halving`` is set) grid search and fill *mp*.

        *mp* is returned unchanged when the search was already executed or
        the fit failed. With ``save`` (default True) results are uploaded to
        storage and appended to the model.
        """
        prepared = self._prepare_search(model, mp)
        if prepared is None:
            return mp
        model, pipeline_module, X, y = prepared
        # The tag hashes mp.parameters as they are before the search runs.
        tag = "{}-{}-{}-{}-{}" \
            .format(model.symbol, model.dataset, model.target, model.pipeline, dict_hash(mp.parameters))

        halving = kwargs.get('halving')
        param_grid = kwargs.get('parameter_grid',
                                pipeline_module.PARAMETER_GRID)
        # Perform search
        if not halving:
            gscv = GridSearchCV(estimator=pipeline_module.estimator,
                                param_grid=param_grid,
                                cv=StratifiedKFold(n_splits=mp.cv_splits),
                                scoring=get_precision_scorer(),
                                verbose=kwargs.get("verbose", 0),
                                n_jobs=kwargs.get("n_jobs", None),
                                refit=False)
        else:
            gscv = HalvingGridSearchCV(
                estimator=pipeline_module.estimator,
                param_grid=param_grid,
                factor=2,
                cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
                scoring=get_precision_scorer(),
                verbose=kwargs.get("verbose", 0),
                # n_jobs must be an integer; ``cpu_count() / 2`` was a float.
                n_jobs=kwargs.get("n_jobs", max(1,
                                                cpu_count() // 2)),
                refit=False,
                random_state=0)

        if not GridSearchService._fit_search(gscv, X, y, mp, tag):
            return mp  # Fit failed, don't save this.

        # Collect results
        results_df = pd.DataFrame(gscv.cv_results_)

        # Update search request with results
        mp.parameter_search_method = 'halving_grid_search' if halving else 'gridsearch'
        mp.parameters = gscv.best_params_
        mp.cv_results = results_df.to_dict()
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        # Save grid search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'grid-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'grid-search-results',
                                    mp.result_file)
            # Update model with the new results
            # NOTE(review): assumed to belong to the save guard - confirm
            # against the formatted original.
            self.model_repo.append_parameters(model.id, mp)
        return mp

    def random_search(self, model: "Model", mp: "ModelParameters",
                      **kwargs) -> "ModelParameters":
        """Run a randomized search (``n_iter`` default 10) and fill *mp*.

        *mp* is returned unchanged when the search was already executed or
        the fit failed. With ``save`` (default True) results are uploaded to
        storage and appended to the model.
        """
        prepared = self._prepare_search(model, mp)
        if prepared is None:
            return mp
        model, pipeline_module, X, y = prepared
        # The tag hashes mp.parameters as they are before the search runs.
        tag = "{}-{}-{}-{}-{}" \
            .format(model.symbol, model.dataset, model.target, model.pipeline, dict_hash(mp.parameters))

        rscv = RandomizedSearchCV(
            estimator=pipeline_module.estimator,
            param_distributions=kwargs.get(
                'param_distributions', pipeline_module.PARAMETER_DISTRIBUTION),
            n_iter=kwargs.get('n_iter', 10),
            cv=StratifiedKFold(n_splits=mp.cv_splits),
            scoring=get_precision_scorer(),
            verbose=kwargs.get("verbose", 0),
            n_jobs=kwargs.get("n_jobs", None),
            refit=False,
            random_state=0)

        if not GridSearchService._fit_search(rscv, X, y, mp, tag):
            return mp  # Fit failed, don't save this.

        # Collect results
        results_df = pd.DataFrame(rscv.cv_results_)

        # Update search request with results
        mp.parameter_search_method = 'randomsearch'
        mp.parameters = rscv.best_params_
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        # Save grid search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'random-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'random-search-results',
                                    mp.result_file)
            # Update model with the new results
            # NOTE(review): assumed to belong to the save guard - confirm
            # against the formatted original.
            self.model_repo.append_parameters(model.id, mp)
        return mp

    def grid_search_new(self, symbol: str, dataset: str, target: str,
                        pipeline: str, split: float,
                        feature_selection_method: str, **kwargs):
        """Grid-search *pipeline* on a dataset and store a model with the result.

        Features are loaded through the named feature selection method.
        ``replace`` removes an existing ``gridsearch`` entry first; without
        it, a truthy ``save`` makes an existing entry an error.

        :raises MessageException: if a grid search already exists (see
            above) or the target holds fewer than two classes
        :raises ValueError: if the dataset is too small for cross validation
            and ``permissive`` is not set
        """
        # Check if a model exists and has same search method
        existing_model = self.model_service.get_model(pipeline=pipeline,
                                                      dataset=dataset,
                                                      target=target,
                                                      symbol=symbol)
        if existing_model:
            mp_exists = ModelService.get_model_parameters(existing_model,
                                                          method='gridsearch')
            if mp_exists:
                if kwargs.get('replace'):
                    self.model_service.remove_parameters(
                        model=existing_model, method='gridsearch')
                elif kwargs.get('save'):
                    raise MessageException(
                        f"Grid search already performed for {pipeline}({dataset}.{symbol}) -> {target}"
                    )

        # Retrieve dataset to use
        ds = self.dataset_service.get_dataset(dataset, symbol)

        cv_splits = GridSearchService._choose_cv_splits(
            ds.count, kwargs.get("permissive"))

        # Determine split indices based on dataset
        splits = DatasetService.get_train_test_split_indices(ds, split)
        cv_interval = splits['train']

        # Load dataset features by applying a specified feature selection method
        X = self.dataset_service.get_dataset_features(
            ds=ds,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
            method=feature_selection_method,
            target=target)
        y = self.dataset_service.get_target(
            name=target,
            symbol=symbol,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
        )

        # A search needs at least two classes in the training data
        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))
        logging.info("Dataset loaded: X {} y {} (unique: {})".format(
            X.shape, y.shape, unique))

        # Load pipeline algorithm and parameter grid
        pipeline_module = get_pipeline(pipeline)
        # Perform search
        gscv = GridSearchCV(estimator=pipeline_module.estimator,
                            param_grid=kwargs.get(
                                'parameter_grid',
                                pipeline_module.PARAMETER_GRID),
                            cv=StratifiedKFold(n_splits=cv_splits),
                            scoring=get_precision_scorer(),
                            verbose=kwargs.get("verbose", 0),
                            n_jobs=kwargs.get("n_jobs", None),
                            refit=False)

        mp = ModelParameters(cv_interval=splits['train'],
                             cv_splits=cv_splits,
                             task_key=kwargs.get('task_key', str(uuid4())),
                             features=list(X.columns),
                             parameter_search_method='gridsearch')

        mp.start_at = get_timestamp()
        gscv.fit(X, y)
        mp.end_at = get_timestamp()

        # Collect results
        results_df = pd.DataFrame(gscv.cv_results_)

        mp.parameters = gscv.best_params_
        mp.cv_results = results_df.loc[:, results_df.columns !=
                                       'params'].to_dict('records')

        tag = "{}-{}-{}-{}-{}".format(symbol, dataset, target, pipeline,
                                      dict_hash(mp.parameters))
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        # NOTE(review): a new model is created even when ``existing_model``
        # was found above and regardless of ``save`` - confirm whether the
        # parameters should be appended to the existing model instead.
        model = Model(pipeline=pipeline,
                      dataset=dataset,
                      target=target,
                      symbol=symbol,
                      features=feature_selection_method)
        model.parameters.append(mp)
        self.model_repo.create(model)

        # Save grid search results on storage
        if kwargs.get('save', True):
            storage_service.upload_json_obj(mp.parameters,
                                            'grid-search-results',
                                            'parameters-{}.json'.format(tag))
            storage_service.save_df(results_df, 'grid-search-results',
                                    mp.result_file)
        return mp
def _manage_open_position(ts, asset, position, age, day, values, predicted,
                          change):
    """Apply the exit / stop-adjustment rules to one open position.

    Rules are checked in order: stop loss, take profit, opposite signal,
    HOLD on a position older than ``86400 * 3`` (presumably seconds, i.e.
    three days - confirm against TradingService.get_position_age), and
    finally a stop-loss update when the signal agrees with the position and
    the day moved in its favour.
    """
    max_age = 86400 * 3
    if position.type == 'MARGIN_LONG':
        if TradingService.check_stop_loss(position, values.low):
            ts.close_long(asset=asset,
                          day=day,
                          close_price=position.stop_loss,
                          position=position,
                          detail='Stop Loss')
        elif TradingService.check_take_profit(position, values.high):
            ts.close_long(asset=asset,
                          day=day,
                          close_price=position.take_profit,
                          position=position,
                          detail='Take Profit')
        elif predicted == SELL:
            ts.close_long(asset=asset,
                          day=day,
                          close_price=values.close,
                          position=position,
                          detail='Sell Signal')
        elif predicted == HOLD and age > max_age:
            ts.close_long(asset=asset,
                          day=day,
                          close_price=values.close,
                          position=position,
                          detail='Age')
        elif predicted == BUY:
            if change > 0:
                ts.update_stop_loss(asset=asset,
                                    position=position,
                                    close_price=values.close,
                                    pct=-0.05)
    elif position.type == 'MARGIN_SHORT':
        if TradingService.check_stop_loss(position, values.high):
            ts.close_short(asset=asset,
                           day=day,
                           close_price=position.stop_loss,
                           position=position,
                           detail='Stop Loss')
        elif TradingService.check_take_profit(position, values.low):
            ts.close_short(asset=asset,
                           day=day,
                           close_price=position.take_profit,
                           position=position,
                           detail='Take Profit')
        elif predicted == SELL:
            # If we had some profit and signal is still SELL, book those by lowering stop loss
            if change < 0:
                ts.update_stop_loss(asset=asset,
                                    position=position,
                                    close_price=values.close,
                                    pct=0.05)
        elif predicted == HOLD and age > max_age:
            ts.close_short(asset=asset,
                           day=day,
                           close_price=values.close,
                           position=position,
                           detail='Age')
        elif predicted == BUY:
            ts.close_short(asset=asset,
                           day=day,
                           close_price=values.close,
                           position=position,
                           detail='Buy Signal')


def _liquidate_positions(ts, asset, day, close_price):
    """Close every open position of *asset* at *close_price*."""
    for position in ts.get_open_positions(asset=asset, day=day):
        try:
            if position.type == 'MARGIN_LONG':
                ts.close_long(asset=asset,
                              day=day,
                              close_price=close_price,
                              position=position,
                              detail='Liquidation')
            elif position.type == 'MARGIN_SHORT':
                ts.close_short(asset=asset,
                               day=day,
                               close_price=close_price,
                               position=position,
                               detail='Liquidation')
        except MessageException as e:
            print(f"Order liquidation exception: {e.message}")


def main(pipeline: str, dataset: str, symbol: str, window: int):
    """Simulate margin trading driven by a stored model test's predictions.

    For every day in the test results, open positions are managed, a new
    long (BUY) or short (SELL) position is opened, and equity plus a
    buy-and-hold baseline ($10k at the first test price) are updated. All
    positions are liquidated on the last day of the results.

    :param pipeline: pipeline of the model whose test is traded
    :param dataset: dataset of the model
    :param symbol: symbol of the model and of the OHLCV data
    :param window: window, in days, identifying the test
    :raises MessageException: if no test with that window exists
    """
    ds = DatasetService()
    ms = ModelService()
    ts = TradingService()
    ohlcv_ds = ds.get_dataset('ohlcv', symbol=symbol)
    ohlcv = ds.get_dataset_features(ohlcv_ds)
    # Discretised Bollinger %B signal; -1 marks "no signal" and becomes NaN
    boll = pd.Series(to_discrete_double(percent_b(ohlcv.close, 21), 20, 80),
                     index=ohlcv.index).replace(to_replace=-1, value=np.nan)

    # Only the requested test is traded (originally all tests of the model were).
    test = ms.get_test(pipeline, dataset, 'class', symbol, window)
    if test is None:
        raise MessageException(
            f"No test with window {window} found for {pipeline}({dataset}.{symbol}) -> class"
        )

    # Re-convert classification results from test to a DataFrame
    results = ModelService.parse_test_results(test)

    asset = ts.get_asset(pipeline=pipeline,
                         dataset=dataset,
                         target='class',
                         symbol=symbol,
                         window=test.window['days'])
    # Now use classification results to trade!
    day_count = results.shape[0]
    # The placeholder was previously never filled: print() received the
    # template and the value as two separate arguments.
    print("%B_Precision = {}".format(
        precision_score(results.label,
                        boll.loc[results.index],
                        average='macro',
                        zero_division=0)))

    # Amount to buy in coins for buy and hold: $10k divided by first price in test set
    bh_price = ohlcv.close.loc[test.test_interval.begin]
    bh_amount = 10000 / bh_price
    last_day = results.index[-1].timestamp() if day_count else None

    for cur_day, (index, pred) in enumerate(results.iterrows(), start=1):
        # Get simulation day by converting Pandas' Timestamp to our format
        simulation_day = to_timestamp(index.to_pydatetime())
        # Results dataframe interprets values as float, while they are actually int
        predicted, label = int(pred.predicted), int(pred.label)

        # Grab ohlcv values for current day
        try:
            values = ohlcv.loc[index]
        except KeyError:
            print(f"Day: {index} not in OHLCV index!")
            continue

        try:
            boll_raw = boll.loc[index]
        except KeyError:
            boll_sig = None
            print(f"Day: {index} not in BOLL index!")
        else:
            # ``x != np.nan`` is always True; NaN must be tested with isna.
            boll_sig = None if pd.isna(boll_raw) else boll_raw
        boll_name = TARGETS[boll_sig] if boll_sig is not None else None

        change = TradingService.get_percent_change(values.close, values.open)

        print(
            f"Day {cur_day}/{day_count} [{index}] "
            f"[O {values.open} H {values.high} L {values.low} C {values.close}] "
            f"PCT={change}% "
            f"LABEL={TARGETS[label]} BPRED={boll_name} PRED={TARGETS[predicted]}"
        )

        for p in ts.get_open_positions(asset=asset, day=simulation_day):
            p_age = TradingService.get_position_age(position=p,
                                                    day=simulation_day)
            try:
                _manage_open_position(ts, asset, p, p_age, simulation_day,
                                      values, predicted, change)
            except MessageException as e:
                print(f"Order handling exception: {e.message}")

        try:
            # If prediction is BUY (price will rise) then open a MARGIN LONG position
            if predicted == BUY:
                ts.open_long(asset=asset,
                             day=simulation_day,
                             close_price=values.close,
                             size=0.1,
                             stop_loss=-0.1,
                             take_profit=0.05)
            # If prediction is SELL (price will drop) open a MARGIN SHORT position
            elif predicted == SELL:
                ts.open_short(asset=asset,
                              day=simulation_day,
                              close_price=values.close,
                              size=0.1,
                              stop_loss=0.1,
                              take_profit=-0.05)
        except MessageException as e:
            print(f"Order placement exception: {e.message}")

        # If this is the last trading day of the period, close all open positions
        if index.timestamp() == last_day:
            print("Last trading day reached, liquidating all positions..")
            _liquidate_positions(ts, asset, simulation_day, values.close)

        # Update equity value for the asset
        ts.update_equity(asset=asset, day=simulation_day, price=values.close)
        # Update baseline values for the asset
        ts.update_baseline(asset=asset,
                           day=simulation_day,
                           name='buy_and_hold',
                           value=values.close * bh_amount)

    print("Timeframe done.")
def main(dataset: str):
    """Build a per-symbol summary of ``dataset`` and export overview charts.

    For every symbol in ``SYMBOLS`` the dataset, its ``importances_shap``
    feature selection and its ``class`` target are loaded and one summary
    record is collected (feature counts, valid index range, number of valid
    days, record count and per-class sample counts). The records are then
    rendered as:

    * a timeline of the valid index range of every pair, written to
      ``images/data_summary/timeline.png``;
    * one class-distribution pie chart per pair, written to
      ``images/data_summary/<symbol>_distribution.png``.

    The first rows of the summary frame are printed at the end.

    Args:
        dataset: Name of the dataset to load for each symbol.
    """
    # Local import keeps this block self-contained.
    import os

    out_dir = "images/data_summary"
    # write_image does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)

    dss = DatasetService()
    records = []
    for symbol in SYMBOLS:
        ds = dss.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds, 'importances_shap',
                                                  'class')
        target = dss.get_dataset_target(ds=ds, name='class')
        # np.unique returns the labels sorted, so counts are read positionally.
        # NOTE(review): a target with fewer than three distinct labels raises
        # IndexError here, and positions would no longer line up with
        # SELL/HOLD/BUY - confirm every dataset always contains all classes.
        _, cnt = np.unique(target, return_counts=True)
        if cnt[0] + cnt[1] + cnt[2] != ds.count:
            print(f"Mismatch between classes and count in {symbol}")
        # The feature selection lookup may come back empty (other callers
        # guard it with `if fs:`); report it instead of crashing on None.
        if not fs:
            print(f"No feature selection found for {symbol}")
        mindt = from_timestamp(ds.valid_index_min)
        maxdt = from_timestamp(ds.valid_index_max)
        daysn = (maxdt - mindt).days
        records.append({
            'Pair': symbol,
            'num_features': len(ds.features),
            'sel_features': len(fs.features) if fs else 0,
            'min_index': ds.valid_index_min,
            'max_index': ds.valid_index_max,
            'valid_days': daysn,
            'records': ds.count,
            'sell_count': cnt[0],
            'hold_count': cnt[1],
            'buy_count': cnt[2]
        })
    df = pd.DataFrame.from_records(records)

    # Timeline of the valid index range of every pair.
    fig = px.timeline(df, x_start="min_index", x_end="max_index", y="Pair")
    # Otherwise tasks are listed from the bottom up.
    fig.update_yaxes(autorange="reversed")
    fig.update_layout(title={
        'text': "Sample distribution across datasets",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
                      paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0.5)',
                      font={'color': 'White'},
                      margin={
                          'l': 5,
                          'r': 5,
                          't': 80,
                          'b': 5,
                          'pad': 5
                      })
    fig.write_image(f"{out_dir}/timeline.png")

    # One class-distribution pie chart per pair.
    pie_labels = ['SELL', 'HOLD', 'BUY']
    for symbol in SYMBOLS:
        sdf = df[df.Pair == symbol]
        pie_values = [
            sdf['sell_count'].values[0],
            sdf['hold_count'].values[0],
            sdf['buy_count'].values[0]
        ]
        sfig = go.Figure(data=[
            go.Pie(labels=pie_labels,
                   values=pie_values,
                   textinfo='label+percent',
                   showlegend=False)
        ])
        sfig.update_layout(title={
            'text': f"Class distribution for pair {symbol}",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {
                'size': 22
            }
        },
                           paper_bgcolor='rgba(0,0,0,0)',
                           plot_bgcolor='rgba(0,0,0,0)',
                           font={
                               'color': 'White',
                               'size': 26
                           },
                           margin={
                               'l': 0,
                               'r': 0,
                               't': 80,
                               'b': 0,
                               'pad': 0
                           },
                           uniformtext_minsize=24)
        sfig.write_image(f"{out_dir}/{symbol}_distribution.png")
    print(df.head())
def main(dataset: str):
    """Render quarterly trade/equity plots for every backtested model.

    Iterates over every combination of ``PIPELINES``, ``SYMBOLS`` and
    ``WINDOWS``. For each combination the model test and the trading asset are
    looked up; when either is missing, a ``MISSING_TEST`` / ``MISSING_ASSET``
    line is queued and the combination is skipped. Otherwise equity, the
    ``buy_and_hold`` baseline, orders, OHLCV features and one-hot encoded
    predictions are split by year and quarter and handed to ``make_plot``,
    one image per quarter. Images that already exist on disk are skipped.

    The queued log lines are written to ``trading_plotly.<dataset>.log`` once
    all combinations have been processed.

    Args:
        dataset: Name of the dataset the models were tested on.
    """
    ds = DatasetService()
    ms = ModelService()
    ts = TradingService()
    logs = []
    signal_columns = ("is_sell", "is_hold", "is_buy")
    for pipeline in PIPELINES:
        for symbol in SYMBOLS:
            # The OHLCV dataset only depends on the symbol: fetch it once
            # rather than once per window.
            ohlcv_ds = ds.get_dataset('ohlcv', symbol=symbol)
            for window in WINDOWS:
                print(
                    f"PIPELINE: {pipeline} SYMBOL: {symbol} WINDOW: {window}")
                test = ms.get_test(pipeline=pipeline,
                                   dataset=dataset,
                                   target='class',
                                   symbol=symbol,
                                   window=window)
                if not test:
                    print(
                        f"Test {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
                    )
                    logs.append(
                        f"MISSING_TEST {pipeline} {dataset} {symbol} class {window} --features importances_shap --parameters gridsearch\n"
                    )
                    continue
                asset = ts.get_asset(pipeline=pipeline,
                                     dataset=dataset,
                                     target='class',
                                     symbol=symbol,
                                     window=window,
                                     create=False)
                if not asset:
                    print(
                        f"Asset {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
                    )
                    logs.append(
                        f"MISSING_ASSET {pipeline} {dataset} {symbol} {window}\n"
                    )
                    continue

                equity = TradingService.parse_equity_df(asset=asset)
                buy_and_hold = TradingService.parse_baseline_df(
                    asset=asset, name='buy_and_hold')
                orders = TradingService.parse_orders_df(asset=asset)

                # Map order position_id to small numbers so we don't get a
                # mess in the graph. The result is assigned back to the frame:
                # calling replace(..., inplace=True) on `orders.position_id`
                # mutates a temporary Series, which does not update `orders`
                # under pandas Copy-on-Write. A single mapping also avoids
                # collisions between ids and already assigned numbers, and
                # numbers positions in order of first appearance.
                uid_to_number = {
                    uid: number
                    for number, uid in enumerate(orders['position_id'].unique())
                }
                orders['position_id'] = orders['position_id'].map(
                    uid_to_number)

                ohlcv = ds.get_dataset_features(
                    ohlcv_ds,
                    begin=test.test_interval.begin,
                    end=test.test_interval.end)
                # The last row of the parsed test results is dropped.
                test_results = ModelService.parse_test_results(test).iloc[:-1]

                # Draw signals 10% below the low price of the bar.
                signals_level_diff = ohlcv.low * 10 / 100
                signals_level = ohlcv.low - signals_level_diff
                enc_pred = onehot_target(
                    test_results.predicted,
                    labels=["is_sell", "is_hold", "is_buy"],
                    fill=False)
                # In case of classifier bias (due to input bias) some classes
                # are ignored. In such cases, enc_pred won't contain the
                # ignored classes: add them back nan-filled (never selected).
                # Selected entries (> 0) are replaced by the signal level;
                # again the masked column is assigned back instead of relying
                # on a chained inplace operation.
                for column in signal_columns:
                    if column in enc_pred.columns:
                        enc_pred[column] = enc_pred[column].mask(
                            enc_pred[column] > 0,
                            other=signals_level.loc[enc_pred.index])
                    else:
                        enc_pred[column] = pd.Series(np.nan,
                                                     index=enc_pred.index)

                # Get unique years in index to split plots in smaller scale
                unique_years = ohlcv.index.year.unique()
                for year in unique_years:
                    year_ohlcv = ohlcv[ohlcv.index.year == year]
                    year_pred = enc_pred[enc_pred.index.year == year]
                    year_equity = equity[equity.index.year == year]
                    year_buy_and_hodl = buy_and_hold[buy_and_hold.index.year
                                                     == year]
                    year_orders = orders[orders.index.year == year]
                    unique_quarters = year_ohlcv.index.quarter.unique()
                    for quarter in unique_quarters:
                        img_path = f"images/backtests-final/{pipeline}-{dataset}-class-W{window}/{symbol}/"
                        img_name = f"trades-{year}-Q{quarter}.png"
                        # Do not re-render plots that were already produced.
                        if os.path.exists(f"{img_path}/{img_name}"):
                            print(f"[SKIP] File exists {img_path}/{img_name}")
                            continue
                        q_ohlcv = year_ohlcv[year_ohlcv.index.quarter ==
                                             quarter]
                        q_pred = year_pred[year_pred.index.quarter == quarter]
                        q_equity = year_equity[year_equity.index.quarter ==
                                               quarter]
                        q_orders = year_orders[year_orders.index.quarter ==
                                               quarter]
                        q_buy_and_hodl = year_buy_and_hodl[
                            year_buy_and_hodl.index.quarter == quarter]
                        make_plot(
                            ohlcv=q_ohlcv,
                            orders=q_orders,
                            equity=q_equity,
                            baselines=[('Buy and Hold', q_buy_and_hodl)],
                            pred=q_pred,
                            signals_title=
                            f"{ohlcv_ds.symbol}, {pipeline}, W={window}D, {year} - Q{quarter}, 1D",
                            img_path=img_path,
                            img_name=img_name,
                            bollinger=True)
                        print(
                            f"{year}-Q{quarter} saved to {img_path}{img_name}")
    # Persist the missing test/asset notices collected along the way.
    with open(f"trading_plotly.{dataset}.log", "w") as f:
        f.writelines(logs)
    print("Logs saved")