Exemplo n.º 1
0
def main(dataset: str, target: str, symbol: str):
    """Show a SHAP summary plot and print the per-class SHAP value frame.

    Loads the 'importances_shap' feature selection of the requested dataset,
    plots its SHAP values against the features of the selection's search
    interval, then prints the head of a dataframe holding one
    ``<feature>_class<k>`` column per feature and class.

    :param dataset: name of the dataset to load
    :param target: target name; in this function it only appears in the title
    :param symbol: symbol whose dataset is loaded
    """
    service = DatasetService()
    ds = service.get_dataset(name=dataset, symbol=symbol)
    # NOTE(review): both the feature selection and the target series are
    # requested for the literal 'class' target rather than `target` --
    # confirm this is intended.
    fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target='class')

    shap_values, shap_expected_values = parse_shap_values(fs.shap_values)
    interval = fs.search_interval
    X = service.get_dataset_features(ds=ds, begin=interval.begin, end=interval.end)
    # The target series is fetched but not used further below.
    y = service.get_target(name='class', symbol=symbol, begin=interval.begin, end=interval.end)

    fig = plt.figure()
    plt.suptitle(f"Shap summary plot for {dataset}.{symbol} -> {target}")
    shap.summary_plot(
        shap_values,
        X,
        class_names=["SELL", "HOLD", "BUY"],
        show=False,
        max_display=352,
        use_log_scale=True,
    )
    plt.tight_layout()
    fig.show()

    def _class_frame(class_index, values):
        # One frame per class, its columns tagged with the class index.
        frame = pd.DataFrame(values, columns=X.columns, index=X.index)
        frame.columns = [f"{c}_class{class_index}" for c in frame.columns]
        return frame

    shap_df = pd.concat(
        [_class_frame(cls, arr) for cls, arr in enumerate(shap_values)],
        axis='columns',
    )
    # Sorting the labels puts the class columns of a feature next to each other.
    shap_df = shap_df.reindex(sorted(shap_df.columns), axis=1)
    print(shap_df.head())
Exemplo n.º 2
0
    def create_model_test(self,
                          *,
                          model: Model,
                          split=0.7,
                          step=None,
                          task_key=None,
                          window=None,
                          **kwargs):
        """Build a ``ModelTest`` for *model* over the dataset's test split.

        :param model: model whose dataset, symbol, target and stored
            parameters are read
        :param split: value forwarded to
            ``DatasetService.get_train_test_split_indices``
        :param step: test step; falls back to the dataset's ``interval``
        :param task_key: identifier of the test; a fresh ``uuid4`` string is
            used when falsy
        :param window: training window; falls back to ``{'days': 30}``
        :param kwargs: optional ``parameters`` (the string ``'latest'``
            selects the parameters of the model's last stored entry) and
            ``features`` (a string is treated as a feature selection method
            name and replaced by that selection's features)
        :return: the new ``ModelTest``
        """
        dataset_service = DatasetService()
        ds = dataset_service.get_dataset(model.dataset, model.symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)

        parameters = kwargs.get('parameters')
        if isinstance(parameters, str) and parameters == 'latest':
            # Resolve the shortcut to the most recently stored parameters, if any.
            parameters = model.parameters[-1].parameters if model.parameters else None

        features = kwargs.get('features')
        if isinstance(features, str):
            selection = DatasetService.get_feature_selection(ds=ds,
                                                             method=features,
                                                             target=model.target)
            features = selection.features if selection else None

        return ModelTest(window=window or {'days': 30},
                         step=step or ds.interval,
                         parameters=parameters or {},
                         features=features or [],
                         test_interval=splits['test'],
                         task_key=task_key or str(uuid4()))
Exemplo n.º 3
0
def main():
    """Check stored parameter searches against the SHAP feature selection.

    Queries every model of dataset 'merged_new' with target 'class'. Models
    without a feature selection or without parameters are reported and
    skipped. For each remaining parameter entry, every feature missing from
    the selection is logged as an error and the number of matching features
    is logged afterwards.
    """
    model_service = ModelService()
    dataset_service = DatasetService()
    query = {"dataset": "merged_new", "target": "class"}

    for m in model_service.query_models(query=query):
        ds = dataset_service.get_dataset(name=m.dataset, symbol=m.symbol)
        fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target=m.target)
        if not fs:
            logging.error(f"Dataset {m.dataset}{m.symbol} -> {m.target} does not have feature selection")
            continue
        if not m.parameters:
            logging.error(f"Model {m.pipeline}({m.dataset}{m.symbol}) -> {m.target} does not have parameters")
            continue

        for mp in m.parameters:
            matched = 0
            for feature in mp.features:
                if feature in fs.features:
                    matched += 1
                else:
                    # One error line per feature that is not part of the selection.
                    logging.error(f"Model {m.pipeline}({m.dataset}{m.symbol}) -> {m.target} parameter search done without fixing features!")
            logging.info(f"Model {m.pipeline}({m.dataset}{m.symbol}) -> {m.target} GRIDSEARCH {mp.parameter_search_method} done with {matched} features")
Exemplo n.º 4
0
    def predict_day(self, pipeline: str, dataset: str, target: str,
                    symbol: str, day: str, window: dict):
        """Predict a single day with the stored model's latest parameters.

        The training slice runs from ``day`` minus ``window`` up to ``day``.

        :param pipeline: pipeline name used to look up the model
        :param dataset: dataset name used to look up the model
        :param target: target name used to look up the model
        :param symbol: symbol used to look up the model
        :param day: timestamp of the day to predict
        :param window: interval subtracted from ``day`` to find the start of
            the training slice
        :return: whatever the module-level ``predict_day`` function returns
        :raises MessageException: when the dataset's first valid index lies
            after the start of the training slice, or when the target slice
            holds fewer than two distinct values
        """
        model = self.get_model(pipeline, dataset, target, symbol)
        service = DatasetService()
        stored = service.get_dataset(model.dataset, model.symbol)

        # Start of the training slice: `window` before the requested day.
        begin = sub_interval(timestamp=day, interval=window)
        first_valid = from_timestamp(stored.valid_index_min).timestamp()
        if first_valid > from_timestamp(begin).timestamp():
            raise MessageException(
                "Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]"
                .format(model.pipeline, model.dataset, model.symbol, window))

        X = service.get_features(model.dataset, model.symbol, begin=begin, end=day)
        y = service.get_target(model.target, model.symbol, begin=begin, end=day)

        classes, _ = np.unique(y, return_counts=True)
        if len(classes) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".format(
                    model.symbol, model.dataset, model.target, model.pipeline, classes))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(classes))

        pipeline_module = get_pipeline(model.pipeline)
        # Inside a method body this name resolves to the module-level function,
        # not to this method.
        return predict_day(pipeline_module.estimator, model.parameters[-1], X, y, day)
Exemplo n.º 5
0
    def test_model(self, model: Model, mt: ModelTest, **kwargs):
        """Execute the windowed test *mt* for *model* and persist its results.

        The model is created in the repository first when it has no id. A
        test whose ``task_key`` is already recorded for the model is not run
        again.

        :param model: model to test
        :param mt: test description; its ``start_at``, ``end_at``,
            ``classification_results`` and ``classification_report`` fields
            are filled in
        :param kwargs: accepted but not read by this method
        :return: *mt*
        :raises MessageException: when the dataset's first valid index lies
            after the start of the first training window, or when the target
            slice holds fewer than two distinct values
        """
        if not model.id:
            model = self.model_repo.create(model)
        if self.model_repo.exist_test(model.id, mt.task_key):
            logging.info("Model {} test {} already executed!".format(model.id, mt.task_key))
            return mt

        service = DatasetService()
        stored = service.get_dataset(model.dataset, model.symbol)

        # Extend the test interval backwards by one window and forwards by one step.
        begin = sub_interval(timestamp=mt.test_interval.begin, interval=mt.window)
        end = add_interval(timestamp=mt.test_interval.end, interval=mt.step)
        first_valid = from_timestamp(stored.valid_index_min).timestamp()
        if first_valid > from_timestamp(begin).timestamp():
            raise MessageException(
                "Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]"
                .format(model.pipeline, model.dataset, model.symbol, mt.window))

        X = service.get_features(model.dataset, model.symbol, begin=begin, end=end)
        y = service.get_target(model.target, model.symbol, begin=begin, end=end)

        classes, _ = np.unique(y, return_counts=True)
        if len(classes) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".format(
                    model.symbol, model.dataset, model.target, model.pipeline, classes))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(classes))

        pipeline_module = get_pipeline(model.pipeline)
        ranges = timestamp_windows(begin, end, mt.window, mt.step)

        # Timestamps are taken right around the test run itself.
        mt.start_at = get_timestamp()
        df = test_windows(pipeline_module.estimator, mt.parameters, X, y, ranges)
        mt.end_at = get_timestamp()

        mt.classification_results = df.to_dict()

        report = flattened_classification_report_imbalanced(df.label, df.predicted)
        proba_columns = [c for c in df.columns if '_proba_' in c]
        report.update(roc_auc_report(df.label, df.predicted, df[proba_columns]))
        mt.classification_report = report

        self.model_repo.append_test(model.id, mt)
        return mt
# Ordered (prefix, suffix, excluded substring, family label) rules used to
# collapse parametrised feature names into one label; the first match wins.
# An empty suffix matches any name, and an excluded substring of None means
# "no exclusion".
_FEATURE_FAMILY_RULES = (
    ('adrbal1in', 'cnt', None, 'adrbal1in{N}cnt'),
    ('adrbalntv', 'cnt', None, 'adrbalntv{N}cnt'),
    ('splyact', '', 'pct', 'splyact{T}'),
    ('splyadrbal1in', '', None, 'splyadrbal1in{N}'),
    ('splyadrbalntv', '', None, 'splyadrbalntv{N}'),
    ('splyadrtop', '', None, 'splyadrtop{N}'),
    ('adrbalusd', 'cnt', None, 'adrbalusd{N}cnt'),
    ('splyadrbalusd', '', None, 'splyadrbalusd{N}'),
    ('txtfrval', 'ntv', None, 'txtfrval{A}ntv'),
    ('txtfrval', 'usd', None, 'txtfrval{A}usd'),
    ('fee', 'usd', None, 'fee{A}usd'),
    ('gaslmtblk', '', None, 'gaslmtblk'),
    ('gaslmttx', '', None, 'gaslmttx'),
    ('gasusedtx', '', None, 'gasusedtx'),
    # The prefix used to be spelled 'isccont', which did not agree with the
    # 'isscont' label it maps to, so this rule could not match such names.
    ('isscont', '', None, 'isscont'),
)


def _collapse_feature_name(name: str) -> str:
    """Return the family label for *name*, or *name* itself if no rule matches."""
    for prefix, suffix, excluded, family in _FEATURE_FAMILY_RULES:
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        if excluded is not None and excluded in name:
            continue
        return family
    return name


def main(dataset: str):
    """Print a feature-availability table for every symbol of *dataset*.

    Each symbol's feature names are collapsed into family labels. The result
    is a frame with one row per label and one column per symbol (with 'USD'
    removed from the symbol name), holding 'Y' where the symbol has a feature
    of that family and 'N' otherwise. The head of the frame is printed.

    :param dataset: name of the dataset whose symbols are inspected
    """
    ds_service = DatasetService()
    symbols = ds_service.get_dataset_symbols(name=dataset)
    features_by_symbol = {
        s: ds_service.get_dataset(name=dataset, symbol=s).features
        for s in symbols
    }

    # Flatten into one record per symbol: {'symbol': ..., '<family>': 'Y', ...}
    records = []
    for symbol, features in features_by_symbol.items():
        record = {'symbol': symbol.replace('USD', '')}
        for feature in features:
            record[_collapse_feature_name(feature)] = 'Y'
        records.append(record)

    # Families a symbol lacks come out of from_records as NaN; mark them 'N'.
    result_frame = pd.DataFrame.from_records(records).fillna(value='N')
    result_frame = result_frame.set_index(keys='symbol').T
    # The LaTeX rendering is produced but not emitted anywhere yet.
    latex = result_frame.to_latex()
    print(result_frame.head())
Exemplo n.º 7
0
def get_dataset(
        symbol: str,
        dataset: Optional[str] = None,
        target: Optional[str] = None,
        begin: Optional[str] = None,
        end: Optional[str] = None,
        service: DatasetService = Depends(DatasetService),
):
    """Return the features and/or target of *symbol* as CSV text.

    :param symbol: symbol to export
    :param dataset: name of the feature dataset to include, if any
    :param target: name of the target to include, if any
    :param begin: first timestamp; when omitted the dataset's ``index_min``
        is used, and ``'auto'`` selects its ``valid_index_min``
    :param end: last timestamp; when omitted the dataset's ``index_max`` is
        used, and ``'auto'`` selects its ``valid_index_max``
    :param service: dataset service dependency
    :return: CSV text whose index column is labelled ``time``
    :raises HTTPException: status 400 when neither *dataset* nor *target* is
        given
    """
    if not (dataset or target):
        raise HTTPException(
            status_code=400,
            detail="At least one of 'dataset' or 'target' parameters must be specified!",
        )

    # With no feature dataset requested, the bounds come from the 'target' dataset.
    d = service.get_dataset(name=dataset or 'target', symbol=symbol)

    if begin == 'auto':
        begin = d.valid_index_min
    elif not begin:
        begin = d.index_min

    if end == 'auto':
        end = d.valid_index_max
    elif not end:
        end = d.index_max

    frames = []
    if dataset:
        frames.append(
            service.get_features(name=dataset, symbol=symbol, begin=begin, end=end))
    if target:
        frames.append(
            service.get_target(name=target, symbol=symbol, begin=begin, end=end))

    # At least one frame exists here; join column-wise only when there are two.
    res = frames[0] if len(frames) == 1 else pd.concat(frames, axis='columns')
    return res.to_csv(index_label='time')
Exemplo n.º 8
0
def main(dataset: str, target: str, pipeline: str):
    """Export per-day SHAP values of every stored test estimator to CSV.

    For each symbol in ``SYMBOLS`` and each test of the matching model, the
    SHAP values of every loaded estimator are computed for the estimator's
    own day and written, together with that day's label and the expected
    value, to
    ``data/shap_values/<dataset>/<target>/<pipeline>/shap_test_<symbol>_Wdays<days>.csv``.

    :param dataset: dataset name
    :param target: target name
    :param pipeline: pipeline name
    """
    ds_service = DatasetService()
    m_service = ModelService()

    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        # Only the selected features are loaded.
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline,
                                    dataset=dataset,
                                    target=target,
                                    symbol=symbol)

        for t in model.tests:
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)

            print("Calculating shap values...")
            day_frames = []
            for est in tqdm(estimators):
                day_label = y_all.loc[est.day]
                shap_v, shap_exp = get_shap_values(estimator=est,
                                                   X=X_all.loc[est.day],
                                                   X_train=est.train_x,
                                                   bytes=False)
                # One single-row frame per estimator, indexed by its day.
                day_frame = pd.DataFrame([shap_v],
                                         index=[pd.to_datetime(est.day)],
                                         columns=X_all.columns)
                day_frame['label'] = day_label
                day_frame['shap_expected'] = shap_exp
                day_frames.append(day_frame)

            print("Exporting dataframe..")
            cdf = pd.concat(day_frames, axis='index')
            os.makedirs(f"data/shap_values/{dataset}/{target}/{pipeline}/",
                        exist_ok=True)
            cdf.to_csv(
                f"data/shap_values/{dataset}/{target}/{pipeline}/shap_test_{symbol}_Wdays{t.window['days']}.csv",
                index_label='time')
            print("Exported.")

        print(f"Plotted {symbol}")
Exemplo n.º 9
0
def main(dataset: str, target: str, symbol: str):
    """Export the feature hierarchy with importances for one symbol.

    Writes two CSV files under ``data/selection_<dataset>_<target>/``: the
    full hierarchy, and the subset whose ``name`` is among the selected
    features.

    :param dataset: dataset name
    :param target: target name, used for the hierarchy file and output paths
    :param symbol: symbol whose feature selection is exported
    """
    service = DatasetService()
    ds = service.get_dataset(name=dataset, symbol=symbol)
    # NOTE(review): the selection is requested for the literal 'class' target
    # while the paths below use `target` -- confirm this is intended.
    fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target='class')

    # A per-class mean SHAP augmentation of the hierarchy records existed here
    # only as commented-out code and is not performed.
    hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml", importances=fs.feature_importances)

    os.makedirs(f"data/selection_{dataset}_{target}/", exist_ok=True)
    hdf = pd.DataFrame(hierarchy)

    all_csv = f"data/selection_{dataset}_{target}/{symbol}_feature_importances.csv"
    hdf.to_csv(all_csv, index_label='index')
    print(f"Augmented importances dataframe exported to {all_csv}")

    selected_csv = f"data/selection_{dataset}_{target}/{symbol}_feature_importances_selected.csv"
    selected = hdf[hdf.name.isin(fs.features)]
    selected.to_csv(selected_csv, index_label='index')
    print(f"Augmented selected features dataframe exported to {selected_csv}")
Exemplo n.º 10
0
 def create_classification_models(self, query, pipeline):
     """Create models for every dataset/target/pipeline combination.

     For each dataset matching *query*, the features of that symbol's
     'target' dataset (except 'price' and 'pct') are combined with every
     entry of ``PIPELINE_LIST``. The combinations are grouped by symbol and
     each group is handed to ``create_models_batch`` in parallel.

     :param query: dataset query; ``None`` selects ``{"type": "FEATURES"}``
     :param pipeline: not read by this method (``PIPELINE_LIST`` is used)
     :return: flat list of the items returned by all batches
     """
     ds = DatasetService()
     if query is None:
         # Must be a plain dict: the former `{{...}}` literal was a set
         # display containing a dict, which raises TypeError because dicts
         # are unhashable.
         query = {"type": "FEATURES"}
     datasets = ds.query(query)

     # symbol -> list of (dataset, target, pipeline) combinations
     all_models = {}
     for d in datasets:
         # Targets available for this symbol
         tgt = ds.get_dataset('target', d.symbol)
         symbol_items = all_models.setdefault(d.symbol, [])
         symbol_items.extend(
             (d, t, p)
             for t, p in itertools.product(tgt.features, PIPELINE_LIST)
             if t not in ('price', 'pct'))

     # One parallel job per symbol
     results = Parallel(n_jobs=-1)(
         delayed(create_models_batch)(symbol, items)
         for symbol, items in all_models.items())
     return [item for sublist in results for item in sublist]
def main(dataset: str, target: str):
    """Render one SHAP/precision overview figure per pipeline, symbol, test and class.

    For every pipeline in ``PIPELINES`` and symbol in ``SYMBOLS`` the stored
    model is loaded, and for each of its tests and each class label
    ("SELL", "HOLD", "BUY") a three-row figure is built:

    * row 0: rolling mean of the ``pre_<cls>`` column of the test metrics,
    * row 1: line plot of the SHAP columns with the largest spread,
    * row 2: SHAP summary plots for the first, middle and last test day.

    Figures are saved under ``images/shap-test-final/``. SHAP values are read
    from CSV files below ``data/shap_values/<dataset>/<target>/<pipeline>/``.

    :param dataset: dataset name
    :param target: target name
    """
    num_shap_plots = 3
    shap_show_count = 10

    ds_service = DatasetService()
    m_service = ModelService()
    for pipeline in PIPELINES:
        for symbol in SYMBOLS:
            print(
                f"Plotting shap dataframes for pipeline {pipeline} symbol {symbol}"
            )
            ds = ds_service.get_dataset(name=dataset, symbol=symbol)
            fs = DatasetService.get_feature_selection(
                ds=ds, method='importances_shap', target=target)
            X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
            # Loaded but not referenced again in this function.
            y_all = ds_service.get_dataset_target(ds=ds, name=target)
            model = m_service.get_model(pipeline=pipeline,
                                        dataset=dataset,
                                        target=target,
                                        symbol=symbol)
            for t in model.tests:
                # The literal "{label}" survives the f-string so that
                # str.format can fill in the label afterwards.
                placeholder = "{label}"
                csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
                expected_csv_name = csv_name.format(label='SHAP_expected')
                print(f"Loading results for test {t.window}")
                results = ModelService.parse_test_results(test=t)
                # Read but not used below.
                exp_shap_df = pd.read_csv(expected_csv_name,
                                          index_col='time',
                                          parse_dates=True)
                for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                    class_csv_name = csv_name.format(label=label)
                    cls_shap_df = pd.read_csv(class_csv_name,
                                              index_col='time',
                                              parse_dates=True)
                    # Keep only the rows inside the test interval.
                    cls_shap_df = cls_shap_df.loc[t.test_interval.begin:t.
                                                  test_interval.end]

                    # x_train and chunk_size are computed but not used below.
                    x_train = X_all.loc[cls_shap_df.index]
                    chunk_size = int(cls_shap_df.shape[0] / num_shap_plots)

                    fig = plt.figure(constrained_layout=True,
                                     figsize=(100, 50),
                                     dpi=300)  #
                    # Rows 0 and 1 span the full width; row 2 holds one axis
                    # per summary plot.
                    gs = GridSpec(3,
                                  num_shap_plots,
                                  figure=fig,
                                  wspace=1.5,
                                  hspace=0.3)
                    precision_ax = fig.add_subplot(gs[0, :])
                    shap_values_ax = fig.add_subplot(gs[1, :])
                    beeswarms_axs = [
                        fig.add_subplot(gs[2, i])
                        for i in range(num_shap_plots)
                    ]
                    #format_axes(fig)
                    # Union of the top-ranked feature labels of all three days.
                    shap_plot_labels = set()
                    # The 'time' values of the first, middle and last result
                    # rows, stripped of '+00:00', 'T', ':' and '-', form the
                    # DAY suffix of the daily CSV file names.
                    first_shap_day = results.iloc[0]['time'].replace(
                        '+00:00',
                        '').replace('T', '').replace(':', '').replace('-', '')
                    middle_shap_day = results.iloc[int(
                        results.shape[0] / 2)]['time'].replace(
                            '+00:00',
                            '').replace('T', '').replace(':',
                                                         '').replace('-', '')
                    last_shap_day = results.iloc[-1]['time'].replace(
                        '+00:00',
                        '').replace('T', '').replace(':', '').replace('-', '')
                    for idx, dayname in enumerate(
                        [first_shap_day, middle_shap_day, last_shap_day]):
                        day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/daily/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_DAY{dayname}.csv"

                        # Load this day's SHAP values and the matching feature rows
                        cdf_subset = pd.read_csv(day_csv_name,
                                                 index_col='time',
                                                 parse_dates=True)
                        train_subset = X_all.loc[cdf_subset.index]

                        # Get a rank of feature labels based on this section's shap values
                        abs_mean_shap = cdf_subset.abs().mean(axis='index')
                        abs_mean_rank = abs_mean_shap.sort_values(
                            ascending=False)[:shap_show_count]
                        for l in abs_mean_rank.index:
                            # Save labels for features in the top-N
                            shap_plot_labels.add(l)

                        # Plot this section's SHAP values on its own axis
                        plt.sca(beeswarms_axs[idx])
                        shap.summary_plot(cdf_subset.values,
                                          train_subset,
                                          max_display=shap_show_count,
                                          show=False,
                                          color_bar=False,
                                          sort=True)
                        # The displayed upper bound is one day after the last index value.
                        min_date = cdf_subset.index.min().to_pydatetime()
                        max_date = cdf_subset.index.max().to_pydatetime(
                        ) + timedelta(days=1)
                        min_date_f = min_date.strftime("%Y/%m/%d")
                        max_date_f = max_date.strftime("%Y/%m/%d")
                        beeswarms_axs[idx].set_xlabel(
                            f"SHAP values\nWindow: {min_date_f} - {max_date_f}",
                            fontsize=8)
                        beeswarms_axs[idx].tick_params(axis='y',
                                                       which='major',
                                                       labelsize=6)
                        beeswarms_axs[idx].tick_params(axis='x',
                                                       which='major',
                                                       labelsize=8)

                    # Plot shap values: the class CSV is read again, this time
                    # without slicing it to the test interval.
                    day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_.csv"
                    plot_cls_shap_df = pd.read_csv(day_csv_name,
                                                   index_col='time',
                                                   parse_dates=True)

                    def get_spread(series):
                        # Absolute difference between a column's max and min.
                        return np.abs(series.max() - series.min())

                    # Among the collected labels, keep those with the largest spread.
                    plot_rank = plot_cls_shap_df[list(shap_plot_labels)].apply(
                        get_spread, axis='index').sort_values(
                            ascending=False)[:shap_show_count]
                    # The comprehension's `t` is local to the comprehension
                    # and does not rebind the loop's test `t`.
                    plot_cls_shap_df['xlabel'] = [
                        t.to_pydatetime().strftime("%Y/%m/%d")
                        for t in plot_cls_shap_df.index
                    ]
                    shap_ax = plot_cls_shap_df.plot(
                        x='xlabel',
                        y=[c for c in plot_rank.index],
                        kind='line',
                        ax=shap_values_ax,
                        legend=False,
                        xlabel='')
                    # Place the legend outside the axes, to the right.
                    patches, labels = shap_ax.get_legend_handles_labels()
                    shap_ax.legend(patches,
                                   labels,
                                   loc='center left',
                                   bbox_to_anchor=(1, 0.5),
                                   prop={'size': 6})
                    shap_ax.tick_params(axis='x', which='major', labelsize=8)
                    # NOTE(review): the axis is labelled mean(|SHAP|) but the
                    # frame is plotted as read from the CSV -- confirm the
                    # file already holds that aggregate.
                    shap_ax.set_ylabel('mean(|SHAP|)', fontsize=6)
                    #shap_ax.tick_params(labelbottom=False, labelleft=False)

                    # Get Metrics scores dataframe, smoothed with a 7-row rolling mean
                    cri_df = get_metrics_df(results).rolling(
                        7, min_periods=1).mean()
                    cri_df['xlabel'] = [
                        t.to_pydatetime().strftime("%Y/%m/%d")
                        for t in cri_df.index
                    ]
                    # Only the current class' pre_<cls> column is plotted.
                    cri_ax = cri_df.plot(x='xlabel',
                                         y=f"pre_{cls}",
                                         kind='line',
                                         ax=precision_ax,
                                         legend=False,
                                         xlabel='')
                    patches, labels = cri_ax.get_legend_handles_labels()
                    cri_ax.legend(patches,
                                  labels,
                                  loc='center left',
                                  bbox_to_anchor=(1, 0.5),
                                  prop={'size': 6})
                    cri_ax.set_ylabel('mean(precision)', fontsize=6)
                    cri_ax.tick_params(labelbottom=False, labelleft=True)

                    min_date = cri_df.index.min().to_pydatetime().strftime(
                        "%Y/%m/%d")
                    max_date = cri_df.index.max().to_pydatetime().strftime(
                        "%Y/%m/%d")
                    window = t.window['days']
                    fig.suptitle(
                        f"{symbol}, {pipeline}, W={window}D, Class {label}, From {min_date} to {max_date}"
                    )

                    # fig.show()
                    os.makedirs(f"images/shap-test-final/", exist_ok=True)
                    # savefig and close act on pyplot's current figure.
                    plt.savefig(
                        f"images/shap-test-final/{pipeline}_W{window}D_{dataset}_{target}_{symbol}_{label}.png",
                        dpi='figure')
                    plt.close()
                    print(f"{label} OK")

            print(f"Exported symbol {symbol}.")
            # # Load day estimator
            # est = load_estimator()

        # Runs once per pipeline, after the symbol loop, so it reports the
        # last symbol processed.
        print(f"Plotted {symbol}")
def main(dataset: str, target: str, pipeline: str):
    """Load the SHAP CSV files of every stored test, class by class.

    For each symbol in ``SYMBOLS`` and each test of the matching model this
    creates ``images/shap-test-hierarchy/<dataset>/<target>/<pipeline>/``,
    reads the expected-value CSV and the three per-class SHAP CSVs, slices
    each class frame to the test interval and prints a progress line. The
    plotting stage that followed existed only as commented-out code, so
    nothing is drawn or saved.

    :param dataset: dataset name
    :param target: target name
    :param pipeline: pipeline name
    """
    # The hierarchy frame is built up front but not read by the live code.
    hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml")
    hdf = pd.DataFrame(hierarchy)

    num_shap_plots = 3
    class_labels = ("SELL", "HOLD", "BUY")

    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Plotting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline,
                                    dataset=dataset,
                                    target=target,
                                    symbol=symbol)

        for t in model.tests:
            os.makedirs(
                f"images/shap-test-hierarchy/{dataset}/{target}/{pipeline}/",
                exist_ok=True)

            # The literal "{label}" survives the f-string so that str.format
            # can fill in the label afterwards.
            placeholder = "{label}"
            csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"

            print(f"Loading results for test {t.window}")
            results = ModelService.parse_test_results(test=t)
            exp_shap_df = pd.read_csv(csv_name.format(label='SHAP_expected'),
                                      index_col='time',
                                      parse_dates=True)

            for label in class_labels:
                cls_shap_df = pd.read_csv(csv_name.format(label=label),
                                          index_col='time',
                                          parse_dates=True)
                # Keep only the rows inside the test interval.
                cls_shap_df = cls_shap_df.loc[t.test_interval.begin:t.test_interval.end]

                # Inputs of the disabled plotting stage; still computed so a
                # missing index entry fails here as before.
                x_train = X_all.loc[cls_shap_df.index]
                chunk_size = int(cls_shap_df.shape[0] / num_shap_plots)

                print(f"{label} OK")

            print(f"Exported symbol {symbol}.")

        print(f"Plotted {symbol}")
Exemplo n.º 13
0
def main(dataset: str, target: str, pipeline: str):
    """Export SHAP values of every tested estimator to CSV files.

    For each symbol in ``SYMBOLS`` the model matching ``pipeline``,
    ``dataset`` and ``target`` is loaded.  For every test of that model, SHAP
    values are computed on the training data of each stored estimator and
    written under ``data/shap_values/{dataset}/{target}/{pipeline}``:

    * ``daily/``: one CSV per estimator day and class holding the SHAP values
      of that estimator's training data;
    * one CSV per class holding the mean absolute SHAP value of each feature,
      one row per estimator day;
    * one ``SHAP_expected`` CSV holding the expected values per estimator day.

    Args:
        dataset: Dataset name used to look up the dataset and the model.
        target: Target name used to look up the feature selection and model.
        pipeline: Pipeline name used to look up the model.
    """
    class_labels = ["SELL", "HOLD", "BUY"]
    out_dir = f"data/shap_values/{dataset}/{target}/{pipeline}"
    # The output location depends only on the arguments, so it is created
    # once instead of once per model test.
    os.makedirs(f"{out_dir}/daily", exist_ok=True)

    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        # NOTE(review): nothing below consumes the dataset, feature selection,
        # features or target loaded here.  The calls are kept so the script
        # still fails at the same point when that data is missing -- confirm
        # whether they can be dropped.
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        ds_service.get_dataset_features(ds=ds, columns=fs.features)
        ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline,
                                    dataset=dataset,
                                    target=target,
                                    symbol=symbol)
        for t in model.tests:
            # "{label}" survives the f-string and is filled in per class
            # later through str.format().
            placeholder = "{label}"
            csv_name = f"{out_dir}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
            day_csv_name = f"{out_dir}/daily/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_"
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)

            # One list of {feature: mean(|SHAP|), 'time': day} records per class.
            shaps = [[] for _ in class_labels]
            shap_expected = []
            print("Calculating shap values")
            for est in tqdm(estimators):
                training_data = est.train_x.astype(np.float64).fillna(value=0)

                shap_v, shap_exp = get_shap_values(estimator=est.named_steps.c,
                                                   X=training_data,
                                                   X_train=training_data,
                                                   bytes=False)

                if isinstance(shap_exp, float):
                    # A single expected value is stored in the last column,
                    # with zeros for the other two.
                    shap_expected.append([est.day, 0, 0, shap_exp])
                else:
                    shap_expected.append([est.day, *shap_exp])

                # Compact day tag for file names; identical for all classes,
                # so it is computed once per estimator.
                dayname = est.day.replace('+00:00', '').replace(
                    'T', '').replace(':', '').replace('-', '')
                for cls, label in enumerate(class_labels):
                    df = pd.DataFrame(shap_v[cls],
                                      index=est.train_x.index,
                                      columns=est.train_x.columns)
                    # Save the SHAP values dataframe for this day and class.
                    day_class_csv_name = day_csv_name.format(
                        label=label) + f"DAY{dayname}.csv"
                    df.to_csv(day_class_csv_name, index_label='time')

                    # Per-feature mean absolute SHAP value for this day.
                    df_abs_mean = df.abs().mean().to_dict()
                    df_abs_mean['time'] = est.day
                    shaps[cls].append(df_abs_mean)

            # Merge the daily records into one dataframe per class and save it.
            for cls, label in enumerate(class_labels):
                class_csv_name = csv_name.format(label=label)
                print(
                    f"Exporting dataframe for class {label} -> {class_csv_name}"
                )
                cdf = pd.DataFrame.from_records(shaps[cls])
                cdf.index = pd.to_datetime(cdf.time)
                cdf = cdf[cdf.columns.difference(['time'])]
                cdf.to_csv(class_csv_name, index_label='time')

            expected_csv_name = csv_name.format(label='SHAP_expected')
            print(
                f"Exporting expected values dataframe -> {expected_csv_name}")
            edf = pd.DataFrame(
                shap_expected,
                columns=[
                    "time", "shap_expected_sell", "shap_expected_hold",
                    "shap_expected_buy"
                ],
            )
            # NOTE(review): the default integer index is written under the
            # label 'time' next to the real 'time' column, so the CSV header
            # contains 'time' twice.  Left unchanged because readers of the
            # file may depend on that layout -- confirm before switching to
            # index=False.
            edf.to_csv(expected_csv_name, index_label='time')

            print(f"Exported symbol {symbol}.")

        print(f"Plotted {symbol}")
Exemplo n.º 14
0
def main(pipeline: str, dataset: str, symbol: str, window: int):
    """Plot one candlestick chart per year with the model's predictions.

    Loads the OHLCV data of ``symbol`` over the test interval of the model
    test identified by ``pipeline``, ``dataset``, target ``'class'`` and
    ``window``, then draws, for every year in that data, a candlestick chart
    with volume and the predicted SELL / HOLD / BUY signals as markers.

    Prints a message and returns early when the trading asset or the model
    test cannot be found.

    Args:
        pipeline: Pipeline name of the model.
        dataset: Dataset name of the model.
        symbol: Symbol to plot.
        window: Training window in days identifying the model test.
    """
    ds = DatasetService()
    ms = ModelService()
    ts = TradingService()
    ohlcv_ds = ds.get_dataset('ohlcv', symbol=symbol)
    asset = ts.get_asset(pipeline=pipeline,
                         dataset=dataset,
                         target='class',
                         symbol=symbol,
                         window=window,
                         create=False)
    if not asset:
        print(
            f"Asset {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
        )
        return
    test = ms.get_test(pipeline=pipeline,
                       dataset=dataset,
                       target='class',
                       symbol=symbol,
                       window=window)
    if not test:
        print(
            f"Test {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
        )
        # Without a test there is no interval to plot; previously execution
        # fell through and failed on `test.test_interval`.
        return

    ohlcv = ds.get_dataset_features(ohlcv_ds,
                                    begin=test.test_interval.begin,
                                    end=test.test_interval.end)
    test_results = ModelService.parse_test_results(test).iloc[:-1]
    enc_pred = onehot_target(test_results.predicted,
                             labels=["is_sell", "is_hold", "is_buy"],
                             fill=False)

    # Replace every positive prediction flag with a price level 10% below the
    # candle's low, so the marker is drawn underneath the candle.
    signals_level_diff = ohlcv.low * 10 / 100
    signals_level = ohlcv.low - signals_level_diff
    for column in ("is_sell", "is_hold", "is_buy"):
        # Assign the masked column back explicitly: an in-place mask on a
        # column obtained by attribute access is not guaranteed to update
        # the parent frame.
        enc_pred[column] = enc_pred[column].mask(enc_pred[column] > 0,
                                                 other=signals_level)

    # Split the plots by calendar year to keep each chart readable.
    unique_years = ohlcv.index.year.unique()
    for year in unique_years:
        year_pred = enc_pred[enc_pred.index.year == year]
        year_ohlcv = ohlcv[ohlcv.index.year == year]

        # Set up the mplfinance style and figure; the style is passed to the
        # whole canvas.
        s = mpf.make_mpf_style(base_mpf_style='binance')
        fig = mpf.figure(figsize=(16, 8), style=s)
        fig.suptitle(f"{ohlcv_ds.symbol}, {year}, 1D")

        ax = fig.add_subplot(3, 1, (1, 2))  # main candlestick chart subplot
        av = fig.add_subplot(3, 1, 3, sharex=ax)  # volume subplot

        # Dashed grid lines at the x-axis ticks (i.e. vertical lines).
        ax.grid(axis='x', color='0.5', linestyle='--')
        av.grid(axis='x', color='0.5', linestyle='--')

        apds = [
            # Faint close-price line
            mpf.make_addplot(year_ohlcv.close,
                             ax=ax,
                             type='line',
                             color=(0.5, 0.5, 0.5, 0.05)),
            # Predictions
            mpf.make_addplot(year_pred.is_sell,
                             ax=ax,
                             type='scatter',
                             marker='v',
                             color='red'),
            mpf.make_addplot(year_pred.is_hold,
                             ax=ax,
                             type='scatter',
                             marker='_',
                             color='silver'),
            mpf.make_addplot(year_pred.is_buy,
                             ax=ax,
                             type='scatter',
                             marker='^',
                             color='lime'),
        ]

        mpf.plot(
            year_ohlcv,
            type='candle',
            style=s,
            ax=ax,
            volume=av,
            show_nontrading=True,
            addplot=apds,
            returnfig=True)
        fig.autofmt_xdate()
        fig.tight_layout()
        plt.show()
        print("Done")
Exemplo n.º 15
0
class FeatureSelectionService:
    """Runs feature-selection searches on stored datasets and saves the results."""

    def __init__(self):
        self.model_repo = ModelRepository()
        self.dataset_service = DatasetService()

    @staticmethod
    def _require_two_classes(y, symbol, dataset, target):
        """Raise ``MessageException`` unless ``y`` holds at least two distinct values.

        ``symbol``, ``dataset`` and ``target`` are only used to tag the
        logged error message.
        """
        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))

    def create_features_search(self,
                               *,
                               symbol: str,
                               dataset: str,
                               target: str,
                               split: float,
                               method: str,
                               task_key: str = None) -> ModelFeatures:
        """Build (without running or saving) a feature-search request.

        The search interval is the ``'train'`` part of the train/test split
        of the dataset at ratio ``split``.  A random UUID is used as task key
        when ``task_key`` is empty.
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        result = ModelFeatures(dataset=dataset,
                               target=target,
                               symbol=symbol,
                               search_interval=splits['train'],
                               feature_selection_method=method,
                               task_key=task_key or str(uuid4()))
        return result

    def feature_selection(self, mf: ModelFeatures, **kwargs) -> ModelFeatures:
        """Run the feature search described by ``mf`` and store its outcome.

        Args:
            mf: Search request; ``features``, ``start_at``, ``end_at`` and,
                for the importance-based methods, ``feature_importances``
                are filled in.
            **kwargs: ``save`` (default ``True``) controls whether the result
                is appended to the model repository.

        Returns:
            The updated ``mf``.

        Raises:
            MessageException: the target has fewer than two classes in the
                search interval.
            NotFoundException: the selection method is not recognised.
        """
        # Load dataset
        X = self.dataset_service.get_features(mf.dataset,
                                              mf.symbol,
                                              mf.search_interval.begin,
                                              mf.search_interval.end,
                                              columns=mf.features)
        y = self.dataset_service.get_target(mf.target, mf.symbol,
                                            mf.search_interval.begin,
                                            mf.search_interval.end)

        self._require_two_classes(y, mf.symbol, mf.dataset, mf.target)

        # Perform search
        mf.start_at = get_timestamp()  # Log starting timestamp
        if not mf.feature_selection_method or mf.feature_selection_method == 'importances':
            selector = select_from_model(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
        elif mf.feature_selection_method == 'importances_cv':
            selector = select_from_model_cv(X, y)
            mf.feature_importances = label_feature_importances(
                selector.estimator_.best_estimator_, X.columns)
        elif mf.feature_selection_method == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif mf.feature_selection_method == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif mf.feature_selection_method == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(
                    mf.feature_selection_method))
        mf.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        mf.features = label_support(selector.get_support(), X.columns)

        # Update model with the new results
        if kwargs.get('save', True):
            self.model_repo.append_features_query(
                {
                    "dataset": mf.dataset,
                    "symbol": mf.symbol,
                    "target": mf.target
                }, mf)
        return mf

    def get_available_symbols(self, dataset: str):
        """Return the symbols the dataset service reports for ``dataset``."""
        return self.dataset_service.get_dataset_symbols(name=dataset)

    def feature_selection_new(self, *, symbol: str, dataset: str, target: str,
                              split: float, method: str,
                              **kwargs) -> ModelFeatures:
        """Run a feature selection on a dataset and optionally attach it to it.

        Args:
            symbol: Symbol of the dataset.
            dataset: Name of the dataset.
            target: Name of the target.
            split: Train/test split ratio; the search uses the train part.
            method: Selection method.  Any name containing ``'importances'``
                selects from a model (``'_cv'`` uses the cross-validated
                variant, ``'_shap'`` additionally stores SHAP values); the
                others are ``'fscore'``, ``'relieff'`` and ``'multisurf'``.
                An empty method is treated as ``'importances'``.
            **kwargs: ``replace`` removes an existing selection for the same
                method/target first; ``save`` appends the result to the
                dataset; ``task_key`` overrides the generated task key.

        Returns:
            The populated selection when ``save`` is falsy, otherwise the
            return value of ``DatasetService.append_feature_selection``.

        Raises:
            MessageException: a selection already exists and ``save`` is set
                without ``replace``, or the target has fewer than two classes.
            NotFoundException: the selection method is not recognised.
        """
        ds = self.dataset_service.get_dataset(dataset, symbol)
        fs_exists = DatasetService.has_feature_selection(ds=ds,
                                                         method=method,
                                                         target=target)
        if fs_exists:
            if kwargs.get('replace'):
                self.dataset_service.remove_feature_selection(ds=ds,
                                                              method=method,
                                                              target=target)
            elif kwargs.get('save'):
                raise MessageException(
                    f"Feature selection with method '{method}' alrady performed for '{dataset}.{symbol}' and target '{target}'"
                )

        splits = DatasetService.get_train_test_split_indices(ds, split)
        # `or` (not a .get default) so an explicit task_key=None still gets a
        # generated key, matching create_features_search.
        fs = FeatureSelection(target=target,
                              method=method,
                              search_interval=splits['train'],
                              task_key=kwargs.get('task_key') or str(uuid4()))

        # Load dataset
        X = self.dataset_service.get_dataset_features(
            ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
        y = self.dataset_service.get_dataset_target(
            name=fs.target,
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)

        self._require_two_classes(y, symbol, dataset, target)

        # Perform search
        fs.start_at = get_timestamp()  # Log starting timestamp
        # An empty method defaults to plain importances; normalising it here
        # keeps the substring checks below from failing on None.
        method_name = fs.method or 'importances'
        if 'importances' in method_name:
            if '_cv' in method_name:
                selector = select_from_model_cv(X, y)
            else:
                selector = select_from_model(X, y)
            # NOTE(review): feature_selection() reads
            # `selector.estimator_.best_estimator_` for its CV variant while
            # this method uses `selector.estimator_` for both -- confirm
            # which one select_from_model_cv requires.
            fs.feature_importances = label_feature_importances(
                selector.estimator_, X.columns)
            if '_shap' in method_name:
                # NOTE(review): other callers of get_shap_values pass the
                # estimator as `estimator=`; confirm that `model=` is accepted.
                fs.shap_values = get_shap_values(
                    model=selector.estimator_.named_steps.c, X=X, X_train=X)
        elif method_name == 'fscore':
            selector = select_percentile(X, y, percentile=10)
        elif method_name == 'relieff':
            selector = select_relieff(X, y, percentile=10)
        elif method_name == 'multisurf':
            selector = select_multisurf(X, y, percentile=10)
        else:
            raise NotFoundException(
                "Cannot find feature selection method by {}".format(fs.method))
        fs.end_at = get_timestamp()  # Log ending timestamp

        # Update search request with results
        fs.features = label_support(selector.get_support(), X.columns)

        if not kwargs.get('save'):
            return fs
        return self.dataset_service.append_feature_selection(ds, fs)
Exemplo n.º 16
0
def _save_shap_summary_plot(shap_frame, features, *, dataset, target, symbol,
                            label, show_count, absolute):
    """Draw one SHAP summary plot and save it under ``images/shap-global/``.

    ``absolute`` plots ``|SHAP|`` instead of the signed values and adds the
    ``Absolute`` title prefix and ``_abs`` file-name suffix.
    """
    values = np.abs(shap_frame.values) if absolute else shap_frame.values
    shap.summary_plot(values,
                      features,
                      max_display=show_count,
                      show=False)
    plt.tight_layout()
    title_prefix = "Absolute SHAP" if absolute else "SHAP"
    plt.title(
        f"{title_prefix} Summary plot for {symbol}, top {show_count} features for class {label}"
    )
    suffix = "_abs" if absolute else ""
    plt.savefig(
        f"images/shap-global/{dataset}_{target}__shap__summary_plot_{symbol}_{label}{suffix}_top{show_count}.png"
    )
    plt.close()


def main(dataset: str, target: str):
    """Save global SHAP summary plots for every symbol in ``SYMBOLS``.

    For each symbol the stored ``'importances_shap'`` feature selection is
    loaded and, restricted to the selected features, summary plots are saved
    for the top 50 and the top 25 features: first the signed SHAP values of
    classes SELL, HOLD and BUY, then their absolute values.

    Args:
        dataset: Name of the dataset to load per symbol.
        target: Target of the feature selection.
    """
    # hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml")
    # hdf = pd.DataFrame(hierarchy)

    class_labels = ("SELL", "HOLD", "BUY")
    ds_service = DatasetService()
    for symbol in SYMBOLS:
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds,
                                                  method='importances_shap',
                                                  target=target)
        shap_v, _ = parse_shap_values(fs.shap_values)

        X_train = ds_service.get_dataset_features(
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end)

        # One frame per class over all features, built up front so a
        # malformed SHAP payload fails before any image is written.
        class_frames = [
            pd.DataFrame(shap_v[cls],
                         index=X_train.index,
                         columns=X_train.columns)
            for cls in range(len(class_labels))
        ]
        sel_train = X_train[fs.features]
        sel_frames = [frame[fs.features] for frame in class_frames]

        # Same order as before: per display size, signed plots for all
        # classes first, then the absolute ones.
        for show_count in (50, 25):
            for absolute in (False, True):
                for label, sel_shap in zip(class_labels, sel_frames):
                    _save_shap_summary_plot(sel_shap,
                                            sel_train,
                                            dataset=dataset,
                                            target=target,
                                            symbol=symbol,
                                            label=label,
                                            show_count=show_count,
                                            absolute=absolute)

        print(f"Plotted {symbol}")
Exemplo n.º 17
0
class ModelService:
    def __init__(self):
        """Create the model repository and dataset service this service uses."""
        self.model_repo: ModelRepository = ModelRepository()
        self.dataset_service = DatasetService()

    def create_classification_models(self, query, pipeline):
        """Create models for every dataset / target / pipeline combination.

        Datasets are selected with ``query``; for each one, every feature of
        the symbol's ``'target'`` dataset (except ``'price'`` and ``'pct'``)
        is combined with every entry of ``PIPELINE_LIST``.  The combinations
        are grouped by symbol and handed to ``create_models_batch`` in
        parallel, one batch per symbol.

        Args:
            query: Dataset query; ``None`` selects datasets of type
                ``"FEATURES"``.
            pipeline: Not used by this method (all of ``PIPELINE_LIST`` is
                used); kept for interface compatibility.

        Returns:
            A flat list with the items returned by every batch.
        """
        ds = DatasetService()
        if query is None:
            # Must be a plain dict: the former `{{...}}` literal was a set
            # containing a dict, which raises TypeError (dicts are unhashable).
            query = {"type": "FEATURES"}
        datasets = ds.query(query)
        # All possible combinations, grouped by symbol
        all_models = {}
        for d in datasets:
            # Get targets for this symbol
            tgt = ds.get_dataset('target', d.symbol)
            symbol_items = all_models.setdefault(d.symbol, [])
            for t, p in itertools.product(tgt.features, PIPELINE_LIST):
                if t in ['price', 'pct']:
                    continue
                symbol_items.append((d, t, p))
        # Process one batch of items per symbol
        results = Parallel(n_jobs=-1)(
            delayed(create_models_batch)(symbol, items)
            for symbol, items in all_models.items())
        return [item for sublist in results for item in sublist]

    def clear_features(self, query=None):
        return self.model_repo.clear_features(query or {})

    def clear_parameters(self, query=None):
        return self.model_repo.clear_parameters(query or {})

    def clear_tests(self, query=None):
        return self.model_repo.clear_tests(query or {})

    def all(self):
        return [m for m in self.model_repo.iterable()]

    @staticmethod
    def get_model_parameters(m: Model, method: str):
        for mp in m.parameters:
            if mp.parameter_search_method == method:
                return mp
        return None

    def remove_parameters(self, model: Model, method: str):
        found = None
        for i in range(len(model.parameters)):
            if model.parameters[i].parameter_search_method == method:
                found = i
        if found is not None:
            del model.parameters[found]
            self.model_repo.update(model.id, model)
            return True
        return False

    def get_model(self, model_id):
        return self.model_repo.get(model_id)

    def get_model(self, pipeline: str, dataset: str, target: str, symbol: str):
        result = self.model_repo.query({
            "symbol": symbol,
            "dataset": dataset,
            "target": target,
            "pipeline": pipeline
        })
        if not result:
            return None
        return result[0]

    def get_test(self, pipeline: str, dataset: str, target: str, symbol: str,
                 window: int):
        # result = self.model_repo.get_model_test(pipeline, dataset, target, symbol, window)
        # if not result:
        #     return None
        # return result[0]
        model = self.get_model(pipeline=pipeline,
                               dataset=dataset,
                               target=target,
                               symbol=symbol)
        for t in model.tests:
            if t.window['days'] == window:
                return t
        return None

    @staticmethod
    def parse_test_results(test: ModelTest):
        if isinstance(test, dict):
            test = ModelTest(**test)
        # Re-convert classification results from test to a DataFrame
        results = pd.DataFrame(test.classification_results)
        # Parse index so it's a DateTimeIndex, because Mongo stores it as a string
        results.index = pd.to_datetime(results.time)
        return results

    def get_test_results(self, pipeline: str, dataset: str, target: str,
                         symbol: str, window: int):
        test = self.get_test(pipeline, dataset, target, symbol, window)
        return ModelService.parse_test_results(test)

    def query_models(self, query, projection: Optional[dict] = None):
        return self.model_repo.query(query, projection)

    def create_model_test(self,
                          *,
                          model: Model,
                          split=0.7,
                          step=None,
                          task_key=None,
                          window=None,
                          **kwargs):
        """Build (without running) a ``ModelTest`` for ``model``.

        Args:
            model: Model the test is created for.
            split: Train/test split ratio; the test covers the ``'test'`` part.
            step: Step of the test; defaults to the dataset's interval.
            task_key: Task key; a random UUID is generated when empty.
            window: Training window; defaults to ``{'days': 30}``.
            **kwargs: ``parameters`` -- the string ``'latest'`` resolves to
                the parameters of the model's last parameter entry (nothing
                when there is none); any other value is used as given.
                ``features`` -- a string is treated as the name of a feature
                selection method whose features are used (nothing when that
                selection does not exist); any other value is used as given.

        Returns:
            The new ``ModelTest``.
        """
        dataset_service = DatasetService()
        ds = dataset_service.get_dataset(model.dataset, model.symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)

        parameters = kwargs.get('parameters')
        if isinstance(parameters, str) and parameters == 'latest':
            parameters = (model.parameters[-1].parameters
                          if model.parameters else None)

        features = kwargs.get('features')
        if isinstance(features, str):
            fs = DatasetService.get_feature_selection(ds=ds,
                                                      method=features,
                                                      target=model.target)
            features = fs.features if fs else None

        return ModelTest(window=window or {'days': 30},
                         step=step or ds.interval,
                         parameters=parameters or {},
                         features=features or [],
                         test_interval=splits['test'],
                         task_key=task_key or str(uuid4()))

    def test_model(self, model: Model, mt: ModelTest, **kwargs):
        """Run the test ``mt`` for ``model`` and append it to the repository.

        The model is created in the repository first when it has no id.  A
        test whose task key already exists for the model is returned
        untouched.  Otherwise features and target are loaded from one
        training window before the test interval up to one step after it,
        the pipeline estimator is evaluated over the resulting windows, and
        the classification results and reports are stored on ``mt``.

        Returns:
            ``mt``.

        Raises:
            MessageException: the dataset does not reach back far enough for
                the first training window, or the target has fewer than two
                classes in the loaded range.
        """
        if not model.id:
            model = self.model_repo.create(model)
        already_run = self.model_repo.exist_test(model.id, mt.task_key)
        if already_run:
            logging.info("Model {} test {} already executed!".format(
                model.id, mt.task_key))
            return mt

        # Load dataset
        dataset_service = DatasetService()
        dataset = dataset_service.get_dataset(model.dataset, model.symbol)

        # Training data must include the first training window
        range_begin = sub_interval(timestamp=mt.test_interval.begin,
                                   interval=mt.window)
        range_end = add_interval(timestamp=mt.test_interval.end,
                                 interval=mt.step)
        first_valid = from_timestamp(dataset.valid_index_min).timestamp()
        if first_valid > from_timestamp(range_begin).timestamp():
            raise MessageException(
                "Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]"
                .format(model.pipeline, model.dataset, model.symbol,
                        mt.window))
        X = dataset_service.get_features(model.dataset,
                                         model.symbol,
                                         begin=range_begin,
                                         end=range_end)
        y = dataset_service.get_target(model.target,
                                       model.symbol,
                                       begin=range_begin,
                                       end=range_end)

        classes_present, _ = np.unique(y, return_counts=True)
        if len(classes_present) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(model.symbol, model.dataset, model.target,
                       model.pipeline, classes_present))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    classes_present))

        # Load pipeline
        pipeline_module = get_pipeline(model.pipeline)

        # Slice the loaded range in windows
        ranges = timestamp_windows(range_begin, range_end, mt.window, mt.step)

        # Timestamps bracket the evaluation itself
        mt.start_at = get_timestamp()
        outcome = test_windows(pipeline_module.estimator, mt.parameters, X, y,
                               ranges)
        mt.end_at = get_timestamp()

        mt.classification_results = outcome.to_dict()

        proba_columns = [c for c in outcome.columns if '_proba_' in c]
        clf_report = flattened_classification_report_imbalanced(
            outcome.label, outcome.predicted)
        roc_report = roc_auc_report(outcome.label, outcome.predicted,
                                    outcome[proba_columns])
        clf_report.update(roc_report)
        mt.classification_report = clf_report

        self.model_repo.append_test(model.id, mt)

        return mt

    def test_model_new(self,
                       *,
                       pipeline: str,
                       dataset: str,
                       symbol: str,
                       target: str,
                       split=0.7,
                       step=None,
                       task_key=None,
                       window=None,
                       **kwargs):
        """Run a sliding-window test of a model and build its ``ModelTest``.

        One estimator is fit for every window produced by
        ``timestamp_windows``, stored through ``save_estimator`` and then used
        by ``predict_estimator_day``. The non-empty predictions are collected
        in a dataframe from which the classification reports are computed.

        Args:
            pipeline: Pipeline name identifying the model.
            dataset: Dataset name identifying the model.
            symbol: Symbol identifying the model.
            target: Target identifying the model.
            split: Forwarded to
                ``DatasetService.get_train_test_split_indices``.
            step: Interval between windows; defaults to ``ds.interval``.
            task_key: Key stored on the test; a random UUID when omitted.
            window: Training window interval; defaults to ``{'days': 90}``.
            **kwargs: Recognized keys are ``parameters`` (parameter search
                method to look up on the model), ``features``, ``n_jobs``
                (number of parallel workers) and ``save`` (when truthy the
                test is appended to the model; its negation is passed to
                ``fit_estimator_new`` as ``force``).

        Returns:
            The result of ``model_repo.append_test`` when ``save`` is truthy,
            otherwise the unsaved ``ModelTest``.

        Raises:
            MessageException: If the dataset does not reach back far enough
                to cover the first training window, or if no prediction
                results were produced.
        """
        test_window = window or {'days': 90}
        model = self.get_model(pipeline=pipeline,
                               dataset=dataset,
                               symbol=symbol,
                               target=target)
        # NOTE: skipping tests already executed with the same window is
        # currently disabled; every call re-runs the test.

        ds = self.dataset_service.get_dataset(dataset, symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)
        test_interval = splits['test']
        test_step = step or ds.interval

        # `parameters` names the parameter search method stored on the model
        parameters = kwargs.get('parameters')
        features = kwargs.get('features')
        save = kwargs.get('save')
        mp = ModelService.get_model_parameters(m=model, method=parameters)
        if not mp:
            logging.warning(
                f"Parameter search with method {parameters} does not exist in model"
                f" {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
            )

        # Get training data including the first training window
        begin = sub_interval(timestamp=test_interval["begin"],
                             interval=test_window)
        end = add_interval(timestamp=test_interval["end"], interval=test_step)
        if from_timestamp(ds.valid_index_min).timestamp() > from_timestamp(
                begin).timestamp():
            raise MessageException(
                f"Not enough data for training with window {test_window}!"
                f" {model.pipeline}({model.dataset}.{model.symbol}) -> {model.target}"
            )
        test_X, test_y = self.dataset_service.get_x_y(dataset, symbol, target,
                                                      features, begin, end)
        # Slice testing interval in "sliding" windows
        windows = list(
            timestamp_windows(begin, end, test_window, test_step))

        # Fit the models and make predictions
        storage_service.create_bucket(bucket='fit-estimators')

        # Half of the cores by default, but never fewer than one worker:
        # Parallel(n_jobs=0) is rejected by joblib.
        _n_jobs = int(kwargs.get('n_jobs', max(1, cpu_count() // 2)))
        logging.info(
            f"Fitting {len(windows)} estimators with {_n_jobs} threads..")
        fit_estimators = Parallel(n_jobs=_n_jobs)(
            delayed(fit_estimator_new)(model=model,
                                       mp=mp,
                                       features=features,
                                       day=e,
                                       window=test_window,
                                       X=test_X,
                                       y=test_y,
                                       b=b,
                                       e=e,
                                       force=not save)
            for b, e in tqdm(windows))

        logging.info(
            f"Saving {len(windows)} fit estimators with {_n_jobs} threads..")
        # Executed for its side effect only; the returned names are not used.
        Parallel(n_jobs=_n_jobs)(
            delayed(save_estimator)(estimator=est, )
            for est in tqdm(fit_estimators))

        logging.info(
            f"Predicing {len(windows)} estimators with {_n_jobs} threads..")
        prediction_results = Parallel(n_jobs=_n_jobs)(
            delayed(predict_estimator_day)(estimator=est,
                                           day=est.day,
                                           X=test_X[est.begin:est.end],
                                           y=test_y[est.begin:est.end])
            for est in tqdm(fit_estimators))

        # Windows that yielded no prediction are dropped
        results = [r for r in prediction_results if r is not None]
        df = pd.DataFrame(results)
        if df.empty:
            raise MessageException("TestWindows: Empty result dataframe!")

        mt = ModelTest(
            window=test_window,
            step=test_step,
            # The missing-parameters case above only warns, so do not
            # dereference `mp` unconditionally here.
            parameters=mp.parameters if mp else None,
            features=list(test_X.columns),
            test_interval=test_interval,
            task_key=task_key or str(uuid4()),
            classification_results=df.to_dict(orient='records'),
        )
        # Populate classification report fields
        clf_report = flattened_classification_report_imbalanced(
            df.label, df.predicted)
        roc_report = roc_auc_report(
            df.label, df.predicted,
            df[[c for c in df.columns if '_proba_' in c]])
        clf_report.update(roc_report)
        mt.classification_report = clf_report

        # Save test into the model
        if save:
            return self.model_repo.append_test(model.id, mt)
        return mt

    # def get_test_models(self, *, pipeline: str, dataset: str, symbol: str, target: str, split=0.7, step=None,
    #                    task_key=None, window=None, **kwargs):
    #     _n_jobs = int(kwargs.get('n_jobs', cpu_count() / 2))
    #     model = self.get_model(pipeline=pipeline, dataset=dataset, symbol=symbol, target=target)
    #     ds = self.dataset_service.get_dataset(name=model.dataset, symbol=model.symbol)
    #     for t in enumerate(model.tests):
    #         self.dataset_service
    #         estimator_names = Parallel(n_jobs=_n_jobs)(
    #             delayed(load_estimator)(
    #                 estimator=est,
    #                 model=model,
    #                 parameters=t.parameter_search_method,
    #                 features=features,
    #                 day=day,
    #                 window=window
    #             )
    #             for est in tqdm(fit_estimators))

    @staticmethod
    def load_test_estimators(model: Model, mt: ModelTest, **kwargs):
        """Load the stored estimator for every day of a model test.

        The days are taken from the ``time`` column of
        ``ModelService.parse_test_results(mt)``. Estimators are always looked
        up with ``parameters='gridsearch'`` and
        ``features='importances_shap'``, whatever the test was run with.

        Args:
            model: Model the estimators belong to.
            mt: Model test whose result days are loaded.
            **kwargs: ``n_jobs`` sets the number of parallel workers
                (default: half of the available cores, at least one).

        Returns:
            The list produced by running ``load_estimator`` for each day.
        """
        results = ModelService.parse_test_results(mt)
        test_days = list(results.time)
        # Never fall back to zero workers: Parallel(n_jobs=0) is rejected.
        _n_jobs = int(kwargs.get('n_jobs', max(1, cpu_count() // 2)))
        logging.info(f"Loading {len(test_days)} estimators..")
        estimators = Parallel(n_jobs=_n_jobs)(
            delayed(load_estimator)(model=model,
                                    parameters='gridsearch',
                                    features='importances_shap',
                                    day=day,
                                    window=mt.window)
            for day in tqdm(test_days))

        return estimators

    def compare_models(self,
                       symbol: str,
                       dataset: str,
                       target: str,
                       pipeline: Optional[str] = None):
        """Return the tests stored for a symbol/dataset/target combination.

        Args:
            symbol: Symbol to filter on.
            dataset: Dataset name to filter on.
            target: Target to filter on.
            pipeline: Optional pipeline name; when falsy the lookup is not
                restricted by pipeline.

        Returns:
            Whatever ``model_repo.find_tests`` returns for the criteria.
        """
        criteria = {'symbol': symbol, 'dataset': dataset, 'target': target}
        # The pipeline filter is only forwarded when one was actually given
        if pipeline:
            criteria['pipeline'] = pipeline
        return self.model_repo.find_tests(**criteria)

    def predict_day(self, pipeline: str, dataset: str, target: str,
                    symbol: str, day: str, window: dict):
        """Predict a single day using the data of the window preceding it.

        Features and target are loaded from ``day - window`` up to ``day``
        and handed, together with the pipeline's estimator and the model's
        most recent parameters entry, to the module-level ``predict_day``.

        Args:
            pipeline: Pipeline name identifying the model.
            dataset: Dataset name identifying the model.
            target: Target identifying the model.
            symbol: Symbol identifying the model.
            day: Timestamp of the day to predict.
            window: Interval subtracted from ``day`` to find the first
                training sample.

        Returns:
            The value returned by the module-level ``predict_day``.

        Raises:
            MessageException: If the dataset's first valid index is later
                than the start of the window, or if the loaded target holds
                fewer than two distinct classes.
        """
        model = self.get_model(pipeline, dataset, target, symbol)

        dataset_service = DatasetService()
        dataset_info = dataset_service.get_dataset(model.dataset,
                                                   model.symbol)

        # The training window ends on `day` and reaches `window` backwards
        begin = sub_interval(timestamp=day, interval=window)
        first_valid = from_timestamp(dataset_info.valid_index_min).timestamp()
        window_start = from_timestamp(begin).timestamp()
        if window_start < first_valid:
            raise MessageException(
                "Not enough data for training! [Pipeline: {} Dataset: {} Symbol: {} Window: {}]"
                .format(model.pipeline, model.dataset, model.symbol, window))

        X = dataset_service.get_features(model.dataset,
                                         model.symbol,
                                         begin=begin,
                                         end=day)
        y = dataset_service.get_target(model.target,
                                       model.symbol,
                                       begin=begin,
                                       end=day)

        # A classifier cannot be fit on a single-class target
        classes = np.unique(y)
        if len(classes) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(model.symbol, model.dataset, model.target,
                       model.pipeline, classes))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    classes))

        # Resolve the pipeline's estimator and delegate to the module-level
        # function of the same name
        estimator = get_pipeline(model.pipeline).estimator
        return predict_day(estimator, model.parameters[-1], X, y, day)
Exemplo n.º 18
0
class GridSearchService:
    """Hyper-parameter search (grid, halving grid, randomized) for models.

    Search results are uploaded through ``storage_service`` and attached to
    models through ``ModelRepository``.
    """

    # Minimum samples per CV fold. Each fold trains on 80% of its samples and
    # the smallest useful training window is 30 samples:
    #       30 / 0.8 = 37.5 ~ 40 samples per fold
    MIN_SAMPLES_PER_FOLD = 40

    def __init__(self):
        self.model_repo = ModelRepository()
        self.model_service = ModelService()
        self.dataset_service = DatasetService()

    @staticmethod
    def _choose_cv_splits(sample_count, permissive=False):
        """Pick K for K-fold CV: 5, or 3 when 5 folds would be too small.

        Raises ``ValueError`` when even 3 folds hold fewer than
        ``MIN_SAMPLES_PER_FOLD`` samples, unless ``permissive`` is truthy.
        """
        min_fold = GridSearchService.MIN_SAMPLES_PER_FOLD
        k = 5
        # If samples per fold with 5-fold CV are too low, use 3-folds
        if sample_count / k < min_fold:
            k = 3
        # If samples are still too low, raise a value error
        if sample_count / k < min_fold and not permissive:
            raise ValueError("Not enough samples to perform cross validation!")
        return k

    @staticmethod
    def _require_two_classes(y, symbol, dataset, target, pipeline):
        """Return the distinct classes in ``y``; fail if there are fewer than 2."""
        unique = np.unique(y)
        if len(unique) < 2:
            logging.error(
                "[{}-{}-{}-{}]Training data contains less than 2 classes: {}".
                format(symbol, dataset, target, pipeline, unique))
            raise MessageException(
                "Training data contains less than 2 classes: {}".format(
                    unique))
        return unique

    @staticmethod
    def _timed_fit(search, X, y, mp, tag):
        """Fit ``search`` while recording start/end timestamps on ``mp``.

        Returns ``True`` on success and ``False`` when the fit raised
        ``SplitException`` or ``ValueError`` (both are logged).
        """
        try:
            mp.start_at = get_timestamp()  # Log starting timestamp
            search.fit(X, y)
            mp.end_at = get_timestamp()  # Log ending timestamp
        except SplitException as e:
            logging.exception(
                "Model {} splitting yields single-class folds!\n{}".format(
                    tag, e.message))
            return False
        except ValueError as e:
            logging.exception("Model {} raised ValueError!\n{}".format(tag, e))
            return False
        return True

    @staticmethod
    def _upload_results(mp, results_df, bucket, tag):
        """Upload the best parameters and the CV results table to ``bucket``."""
        storage_service.upload_json_obj(mp.parameters, bucket,
                                        'parameters-{}.json'.format(tag))
        storage_service.save_df(results_df, bucket, mp.result_file)

    def create_parameters_search(self, model: Model, split: float,
                                 **kwargs) -> ModelParameters:
        """Build an (unexecuted) ``ModelParameters`` search request.

        Args:
            model: Model whose dataset and symbol are searched.
            split: Forwarded to
                ``DatasetService.get_train_test_split_indices``.
            **kwargs: ``features`` (name of a feature selection method; when
                given, the selected features are looked up on the dataset),
                ``target`` (target of that feature selection, default
                ``'class'``), ``permissive`` (tolerate too-small folds) and
                ``task_key`` (default: a random UUID).

        Returns:
            A ``ModelParameters`` carrying the training interval, the number
            of CV splits, the task key and the features (or ``None``).

        Raises:
            MessageException: If the requested feature selection is missing.
            ValueError: If there are too few samples for cross validation.
        """
        ds = self.dataset_service.get_dataset(model.dataset, model.symbol)
        splits = DatasetService.get_train_test_split_indices(ds, split)

        # `features` names a feature selection method stored on the dataset
        features = kwargs.get('features')
        if features:
            target = kwargs.get('target', 'class')
            mf = DatasetService.get_feature_selection(ds=ds,
                                                      method=features,
                                                      target=target)
            if not mf:
                raise MessageException(
                    f"Feature selection not found for {model.dataset}.{model.symbol} -> {target}!"
                )
            features = mf.features

        k = self._choose_cv_splits(ds.count, kwargs.get("permissive"))

        return ModelParameters(cv_interval=splits['train'],
                               cv_splits=k,
                               task_key=kwargs.get('task_key', str(uuid4())),
                               features=features or None)

    def _get_dataset_and_pipeline(self, model: Model, mp: ModelParameters,
                                  **kwargs):
        """Load the CV data and the pipeline module for a search request.

        Returns:
            ``(pipeline_module, X, y)``, or ``None`` when a search with
            ``mp.task_key`` is already stored for the model, so that callers
            can return early instead of unpacking.

        Raises:
            MessageException: If the target holds fewer than two classes.
        """
        if not model.id:  # Make sure the task exists
            # NOTE(review): this rebinds the local name only; whether the
            # caller's `model` receives an id depends on `create` mutating
            # its argument. Confirm against ModelRepository.
            model = self.model_repo.create(model)
        if self.model_repo.exist_parameters(model.id, mp.task_key):
            logging.info("Model {} Grid search {} already executed!".format(
                model.id, mp.task_key))
            return None

        # Load dataset
        X = self.dataset_service.get_features(model.dataset,
                                              model.symbol,
                                              mp.cv_interval.begin,
                                              mp.cv_interval.end,
                                              columns=mp.features)
        y = self.dataset_service.get_target(model.target, model.symbol,
                                            mp.cv_interval.begin,
                                            mp.cv_interval.end)

        unique = self._require_two_classes(y, model.symbol, model.dataset,
                                           model.target, model.pipeline)
        logging.info("Dataset loaded: X {} y {} (unique: {})".format(
            X.shape, y.shape, unique))
        # Load pipeline
        pipeline_module = get_pipeline(model.pipeline)
        return pipeline_module, X, y

    def grid_search(self, model: Model, mp: ModelParameters,
                    **kwargs) -> ModelParameters:
        """Run a (halving) grid search for ``model`` and record it on ``mp``.

        Args:
            model: Model to search parameters for.
            mp: Search request; updated in place with timestamps, best
                parameters, CV results and the result file name.
            **kwargs: ``halving`` (use ``HalvingGridSearchCV``),
                ``parameter_grid`` (default: the pipeline's
                ``PARAMETER_GRID``), ``verbose``, ``n_jobs`` and ``save``
                (default ``True``: upload results and append ``mp`` to the
                model).

        Returns:
            ``mp``. It is returned unchanged when the search was already
            executed, and without results when the fit failed.
        """
        loaded = self._get_dataset_and_pipeline(model, mp)
        if loaded is None:
            return mp  # Already executed, nothing to do.
        pipeline_module, X, y = loaded
        # NOTE(review): the tag hashes mp.parameters as they are *before* the
        # search runs; grid_search_new hashes the best parameters instead.
        tag = "{}-{}-{}-{}-{}" \
            .format(model.symbol, model.dataset, model.target, model.pipeline, dict_hash(mp.parameters))

        halving = kwargs.get('halving')
        param_grid = kwargs.get('parameter_grid',
                                pipeline_module.PARAMETER_GRID)

        # Perform search
        if not halving:
            gscv = GridSearchCV(
                estimator=pipeline_module.estimator,
                param_grid=param_grid,
                cv=StratifiedKFold(n_splits=mp.cv_splits),
                scoring=get_precision_scorer(),
                verbose=kwargs.get("verbose", 0),
                n_jobs=kwargs.get("n_jobs", None),
                refit=False)
        else:
            gscv = HalvingGridSearchCV(
                estimator=pipeline_module.estimator,
                param_grid=param_grid,
                factor=2,
                cv=BlockingTimeSeriesSplit(n_splits=mp.cv_splits),
                scoring=get_precision_scorer(),
                verbose=kwargs.get("verbose", 0),
                # n_jobs must be an integer; use half the cores, at least one
                n_jobs=kwargs.get("n_jobs", max(1, cpu_count() // 2)),
                refit=False,
                random_state=0)

        if not self._timed_fit(gscv, X, y, mp, tag):
            return mp  # Fit failed, don't save this.

        # Collect results
        results_df = pd.DataFrame(gscv.cv_results_)

        # Update search request with results
        mp.parameter_search_method = ('halving_grid_search'
                                      if halving else 'gridsearch')
        mp.parameters = gscv.best_params_
        mp.cv_results = results_df.to_dict()
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        # Save grid search results on storage
        if kwargs.get('save', True):
            self._upload_results(mp, results_df, 'grid-search-results', tag)
            # Update model with the new results
            self.model_repo.append_parameters(model.id, mp)

        return mp

    def random_search(self, model: Model, mp: ModelParameters,
                      **kwargs) -> ModelParameters:
        """Run a randomized search for ``model`` and record it on ``mp``.

        Args:
            model: Model to search parameters for.
            mp: Search request; updated in place with timestamps, best
                parameters and the result file name.
            **kwargs: ``param_distributions`` (default: the pipeline's
                ``PARAMETER_DISTRIBUTION``), ``n_iter`` (default 10),
                ``verbose``, ``n_jobs`` and ``save`` (default ``True``).

        Returns:
            ``mp``. It is returned unchanged when the search was already
            executed, and without results when the fit failed.
        """
        loaded = self._get_dataset_and_pipeline(model, mp)
        if loaded is None:
            return mp  # Already executed, nothing to do.
        pipeline_module, X, y = loaded
        tag = "{}-{}-{}-{}-{}" \
            .format(model.symbol, model.dataset, model.target, model.pipeline, dict_hash(mp.parameters))

        rscv = RandomizedSearchCV(estimator=pipeline_module.estimator,
                                  param_distributions=kwargs.get(
                                      'param_distributions',
                                      pipeline_module.PARAMETER_DISTRIBUTION),
                                  n_iter=kwargs.get('n_iter', 10),
                                  cv=StratifiedKFold(n_splits=mp.cv_splits),
                                  scoring=get_precision_scorer(),
                                  verbose=kwargs.get("verbose", 0),
                                  n_jobs=kwargs.get("n_jobs", None),
                                  refit=False,
                                  random_state=0)

        if not self._timed_fit(rscv, X, y, mp, tag):
            return mp  # Fit failed, don't save this.

        # Collect results
        results_df = pd.DataFrame(rscv.cv_results_)

        # Update search request with results
        mp.parameter_search_method = 'randomsearch'
        mp.parameters = rscv.best_params_
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        # Save random search results on storage
        if kwargs.get('save', True):
            self._upload_results(mp, results_df, 'random-search-results', tag)
            # Update model with the new results
            self.model_repo.append_parameters(model.id, mp)

        return mp

    def grid_search_new(self, symbol: str, dataset: str, target: str,
                        pipeline: str, split: float,
                        feature_selection_method: str, **kwargs):
        """Run a grid search and store it on a newly created model.

        Args:
            symbol: Symbol of the dataset.
            dataset: Dataset name.
            target: Target name.
            pipeline: Pipeline name providing estimator and parameter grid.
            split: Forwarded to
                ``DatasetService.get_train_test_split_indices``.
            feature_selection_method: Feature selection method used to load
                the dataset features.
            **kwargs: ``replace`` (remove an existing grid search first),
                ``save`` (see below), ``permissive``, ``parameter_grid``,
                ``verbose``, ``n_jobs`` and ``task_key``.

        Returns:
            The populated ``ModelParameters``.

        Raises:
            MessageException: If a grid search already exists and ``save`` is
                truthy without ``replace``, or if the training target holds
                fewer than two classes.
            ValueError: If there are too few samples for cross validation.
        """
        # Check if a model exists and has same search method
        existing_model = self.model_service.get_model(pipeline=pipeline,
                                                      dataset=dataset,
                                                      target=target,
                                                      symbol=symbol)
        if existing_model:
            mp_exists = ModelService.get_model_parameters(existing_model,
                                                          method='gridsearch')
            if mp_exists:
                if kwargs.get('replace'):
                    self.model_service.remove_parameters(model=existing_model,
                                                         method='gridsearch')
                elif kwargs.get('save'):
                    raise MessageException(
                        f"Grid search already performed for {pipeline}({dataset}.{symbol}) -> {target}"
                    )

        # Retrieve dataset to use
        ds = self.dataset_service.get_dataset(dataset, symbol)

        # Determine cv_splits=K for K-fold cross validation based on
        # the dataset's sample count
        cv_splits = self._choose_cv_splits(ds.count, kwargs.get("permissive"))

        # Determine split indices based on dataset
        splits = DatasetService.get_train_test_split_indices(ds, split)
        cv_interval = splits['train']

        # Load dataset features by applying a specified feature selection method
        X = self.dataset_service.get_dataset_features(
            ds=ds,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
            method=feature_selection_method,
            target=target)
        y = self.dataset_service.get_target(
            name=target,
            symbol=symbol,
            begin=cv_interval['begin'],
            end=cv_interval['end'],
        )

        # A search is pointless on a single-class target: require at least
        # two distinct classes in the training data.
        unique = self._require_two_classes(y, symbol, dataset, target,
                                           pipeline)
        logging.info("Dataset loaded: X {} y {} (unique: {})".format(
            X.shape, y.shape, unique))

        # Load pipeline algorithm and parameter grid
        pipeline_module = get_pipeline(pipeline)

        # Perform search
        gscv = GridSearchCV(
            estimator=pipeline_module.estimator,
            param_grid=kwargs.get('parameter_grid',
                                  pipeline_module.PARAMETER_GRID),
            cv=StratifiedKFold(n_splits=cv_splits),
            scoring=get_precision_scorer(),
            verbose=kwargs.get("verbose", 0),
            n_jobs=kwargs.get("n_jobs", None),
            refit=False)

        mp = ModelParameters(cv_interval=cv_interval,
                             cv_splits=cv_splits,
                             task_key=kwargs.get('task_key', str(uuid4())),
                             features=list(X.columns),
                             parameter_search_method='gridsearch')

        mp.start_at = get_timestamp()
        gscv.fit(X, y)
        mp.end_at = get_timestamp()

        # Collect results
        results_df = pd.DataFrame(gscv.cv_results_)

        mp.parameters = gscv.best_params_
        # The 'params' column is left out of the stored records
        mp.cv_results = results_df.loc[:, results_df.columns !=
                                       'params'].to_dict('records')

        tag = "{}-{}-{}-{}-{}".format(symbol, dataset, target, pipeline,
                                      dict_hash(mp.parameters))
        mp.result_file = 'cv_results-{}.csv'.format(tag)

        # NOTE(review): a new model is always created here, even when
        # `existing_model` was found above and regardless of `save`.
        # Confirm that the repository handles this as intended.
        model = Model(pipeline=pipeline,
                      dataset=dataset,
                      target=target,
                      symbol=symbol,
                      features=feature_selection_method)
        model.parameters.append(mp)
        self.model_repo.create(model)

        # Save grid search results on storage
        if kwargs.get('save', True):
            self._upload_results(mp, results_df, 'grid-search-results', tag)
        return mp
Exemplo n.º 19
0
def main(pipeline: str, dataset: str, symbol: str, window: int):
    """Simulate margin trading driven by the predictions of a model test.

    For every row of the parsed test results the function first manages the
    positions already open (stop loss, take profit, signal- or age-based
    close, stop-loss update), then opens a long on BUY or a short on SELL.
    On the last result day it liquidates everything, and after each day it
    updates the asset's equity and a buy-and-hold baseline.

    Args:
        pipeline: Pipeline name identifying the test and the asset.
        dataset: Dataset name identifying the test and the asset.
        symbol: Symbol whose ``ohlcv`` dataset provides the prices.
        window: Window passed to ``ModelService.get_test``.
    """
    ds = DatasetService()
    ms = ModelService()
    ts = TradingService()
    ohlcv_ds = ds.get_dataset('ohlcv', symbol=symbol)
    ohlcv = ds.get_dataset_features(ohlcv_ds)

    # %B(21) of the close passed through to_discrete_double with 20/80;
    # values of -1 are turned into NaN.
    boll = pd.Series(to_discrete_double(percent_b(ohlcv.close, 21), 20, 80),
                     index=ohlcv.index).replace(to_replace=-1, value=np.nan)

    _test = ms.get_test(pipeline, dataset, 'class', symbol, window)
    for test in [
            _test
    ]:  # I originally traded all the tests in the model. ToDo: Refactor this.
        # Re-convert classification results from test to a DataFrame
        results = ModelService.parse_test_results(test)

        asset = ts.get_asset(pipeline=pipeline,
                             dataset=dataset,
                             target='class',
                             symbol=symbol,
                             window=test.window['days'])
        # Now use classification results to trade!
        day_count = results.shape[0]
        # NOTE(review): boll may hold NaN for result days; confirm that
        # precision_score is expected to receive those values.
        print("%B_Precision = {}".format(
            precision_score(results.label,
                            boll.loc[results.index],
                            average='macro',
                            zero_division=0)))
        # Amount to buy in coins for buy and hold: $10k divided by first price in test set
        bh_price = ohlcv.close.loc[test.test_interval.begin]
        bh_amount = 10000 / bh_price

        for cur_day, (index, pred) in enumerate(results.iterrows(), start=1):
            # Get simulation day by converting Pandas' Timestamp to our format
            simulation_day = to_timestamp(index.to_pydatetime())
            # Results dataframe interprets values as float, while they are actually int
            predicted, label = int(pred.predicted), int(pred.label)

            # Grab ohlcv values for current day
            try:
                values = ohlcv.loc[index]
            except KeyError:
                print(f"Day: {index} not in OHLCV index!")
                continue
            try:
                raw_boll = boll.loc[index]
            except KeyError:
                raw_boll = None
                print(f"Day: {index} not in BOLL index!")
            # NaN never compares equal to anything (not even itself), so it
            # must be detected with pd.isna rather than `!= np.nan`.
            if raw_boll is None or pd.isna(raw_boll):
                boll_sig = None
            else:
                boll_sig = int(raw_boll)
            boll_label = TARGETS[boll_sig] if boll_sig is not None else None
            change = TradingService.get_percent_change(values.close,
                                                       values.open)

            print(
                f"Day {cur_day}/{day_count} [{index}] "
                f"[O {values.open} H {values.high} L {values.low} C {values.close}] "
                f"PCT={change}% "
                f"LABEL={TARGETS[label]} BPRED={boll_label} PRED={TARGETS[predicted]}"
            )
            open_positions = ts.get_open_positions(asset=asset,
                                                   day=simulation_day)
            for p in open_positions:
                p_age = TradingService.get_position_age(position=p,
                                                        day=simulation_day)
                try:
                    if p.type == 'MARGIN_LONG':
                        if TradingService.check_stop_loss(p, values.low):
                            ts.close_long(asset=asset,
                                          day=simulation_day,
                                          close_price=p.stop_loss,
                                          position=p,
                                          detail='Stop Loss')
                        elif TradingService.check_take_profit(p, values.high):
                            ts.close_long(asset=asset,
                                          day=simulation_day,
                                          close_price=p.take_profit,
                                          position=p,
                                          detail='Take Profit')
                        elif predicted == SELL:
                            ts.close_long(asset=asset,
                                          day=simulation_day,
                                          close_price=values.close,
                                          position=p,
                                          detail='Sell Signal')
                        # 86400 * 3: presumably three days in seconds; TODO confirm
                        elif predicted == HOLD and p_age > 86400 * 3:
                            ts.close_long(asset=asset,
                                          day=simulation_day,
                                          close_price=values.close,
                                          position=p,
                                          detail='Age')
                        elif predicted == BUY:
                            # Signal still BUY after a positive day: move the stop loss
                            if change > 0:
                                ts.update_stop_loss(asset=asset,
                                                    position=p,
                                                    close_price=values.close,
                                                    pct=-0.05)
                    elif p.type == 'MARGIN_SHORT':
                        if TradingService.check_stop_loss(p, values.high):
                            ts.close_short(asset=asset,
                                           day=simulation_day,
                                           close_price=p.stop_loss,
                                           position=p,
                                           detail='Stop Loss')
                        elif TradingService.check_take_profit(p, values.low):
                            ts.close_short(asset=asset,
                                           day=simulation_day,
                                           close_price=p.take_profit,
                                           position=p,
                                           detail='Take Profit')
                        elif predicted == SELL:
                            # If we had some profit and signal is still SELL, book those by lowering stop loss
                            if change < 0:
                                ts.update_stop_loss(asset=asset,
                                                    position=p,
                                                    close_price=values.close,
                                                    pct=0.05)
                        elif predicted == HOLD and p_age > 86400 * 3:
                            ts.close_short(asset=asset,
                                           day=simulation_day,
                                           close_price=values.close,
                                           position=p,
                                           detail='Age')
                        elif predicted == BUY:
                            ts.close_short(asset=asset,
                                           day=simulation_day,
                                           close_price=values.close,
                                           position=p,
                                           detail='Buy Signal')
                except MessageException as e:
                    print(f"Order handling exception: {e.message}")

            try:
                # If prediction is BUY (price will rise) then open a MARGIN LONG position
                if predicted == BUY:
                    ts.open_long(asset=asset,
                                 day=simulation_day,
                                 close_price=values.close,
                                 size=0.1,
                                 stop_loss=-0.1,
                                 take_profit=0.05)
                # If prediction is SELL (price will drop) open a MARGIN SHORT position
                elif predicted == SELL:
                    ts.open_short(asset=asset,
                                  day=simulation_day,
                                  close_price=values.close,
                                  size=0.1,
                                  stop_loss=0.1,
                                  take_profit=-0.05)
            except MessageException as e:
                print(f"Order placement exception: {e.message}")

            # If this is the last trading day of the period, close all open positions
            # NOTE(review): if the last result day is missing from the OHLCV
            # index the `continue` above skips this liquidation.
            if index.timestamp() == results.index[-1].timestamp():
                print("Last trading day reached, liquidating all positions..")
                open_positions = ts.get_open_positions(asset=asset,
                                                       day=simulation_day)
                for p in open_positions:
                    try:
                        if p.type == 'MARGIN_LONG':
                            ts.close_long(asset=asset,
                                          day=simulation_day,
                                          close_price=values.close,
                                          position=p,
                                          detail='Liquidation')
                        elif p.type == 'MARGIN_SHORT':
                            ts.close_short(asset=asset,
                                           day=simulation_day,
                                           close_price=values.close,
                                           position=p,
                                           detail='Liquidation')
                    except MessageException as e:
                        print(f"Order liquidation exception: {e.message}")

            # Update equity value for the asset
            ts.update_equity(asset=asset,
                             day=simulation_day,
                             price=values.close)
            # Update baseline values for the asset
            ts.update_baseline(asset=asset,
                               day=simulation_day,
                               name='buy_and_hold',
                               value=values.close * bh_amount)

        print("Timeframe done.")
Exemplo n.º 20
0
def main(dataset: str):
    """Summarise the dataset of every symbol and export the summary charts.

    For each entry of ``SYMBOLS`` the dataset called ``dataset`` is loaded
    along with its ``importances_shap`` feature selection and its ``class``
    target. One summary row per symbol is collected into a DataFrame, which
    is rendered as a timeline (``images/data_summary/timeline.png``) and as
    one pie chart per symbol
    (``images/data_summary/<symbol>_distribution.png``). The head of the
    DataFrame is printed at the end.

    :param dataset: name of the dataset to load for every symbol
    """
    service = DatasetService()
    rows = []
    for symbol in SYMBOLS:
        symbol_ds = service.get_dataset(name=dataset, symbol=symbol)
        selection = DatasetService.get_feature_selection(
            symbol_ds, 'importances_shap', 'class')
        labels = service.get_dataset_target(ds=symbol_ds, name='class')
        # np.unique returns the unique values sorted, so the counts follow
        # that order; the first three are reported as sell / hold / buy.
        _, class_counts = np.unique(labels, return_counts=True)
        sell_n = class_counts[0]
        hold_n = class_counts[1]
        buy_n = class_counts[2]
        if sell_n + hold_n + buy_n != symbol_ds.count:
            print(f"Mismatch between classes and count in {symbol}")
        first_dt = from_timestamp(symbol_ds.valid_index_min)
        last_dt = from_timestamp(symbol_ds.valid_index_max)
        rows.append({
            'Pair': symbol,
            'num_features': len(symbol_ds.features),
            'sel_features': len(selection.features),
            'min_index': symbol_ds.valid_index_min,
            'max_index': symbol_ds.valid_index_max,
            'valid_days': (last_dt - first_dt).days,
            'records': symbol_ds.count,
            'sell_count': sell_n,
            'hold_count': hold_n,
            'buy_count': buy_n
        })
    df = pd.DataFrame.from_records(rows)

    # Timeline of the valid index range of every pair.
    timeline_title = {
        'text': f"Sample distribution across datasets",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    }
    timeline_margin = {'l': 5, 'r': 5, 't': 80, 'b': 5, 'pad': 5}
    fig = px.timeline(df, x_start="min_index", x_end="max_index", y="Pair")
    # otherwise tasks are listed from the bottom up
    fig.update_yaxes(autorange="reversed")
    fig.update_layout(title=timeline_title,
                      paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0.5)',
                      font={'color': 'White'},
                      margin=timeline_margin)
    fig.write_image("images/data_summary/timeline.png")

    # One class-distribution pie chart per pair.
    count_columns = ('sell_count', 'hold_count', 'buy_count')
    pie_margin = {'l': 0, 'r': 0, 't': 80, 'b': 0, 'pad': 0}
    pie_font = {'color': 'White', 'size': 26}
    for symbol in SYMBOLS:
        symbol_rows = df[df.Pair == symbol]
        pie = go.Pie(
            labels=['SELL', 'HOLD', 'BUY'],
            values=[symbol_rows[column].values[0] for column in count_columns],
            textinfo='label+percent',
            showlegend=False)
        sfig = go.Figure(data=[pie])
        pie_title = {
            'text': f"Class distribution for pair {symbol}",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {
                'size': 22
            }
        }
        sfig.update_layout(title=pie_title,
                           paper_bgcolor='rgba(0,0,0,0)',
                           plot_bgcolor='rgba(0,0,0,0)',
                           font=pie_font,
                           margin=pie_margin,
                           uniformtext_minsize=24)
        sfig.write_image(f"images/data_summary/{symbol}_distribution.png")
    print(df.head())
Exemplo n.º 21
0
def main(dataset: str):
    """Plot quarterly backtest charts for every pipeline, symbol and window.

    For each combination of ``PIPELINES``, ``SYMBOLS`` and ``WINDOWS`` the
    stored model test and trading asset are looked up; combinations missing
    either one are reported and recorded as ``MISSING_TEST`` /
    ``MISSING_ASSET`` log lines. For the others, the equity curve, the
    buy-and-hold baseline, the orders, the OHLCV data and the predictions are
    split by year and quarter and handed to ``make_plot``. Quarters whose
    image file already exists are skipped. The collected log lines are
    written to ``trading_plotly.<dataset>.log`` at the end.

    :param dataset: name of the dataset whose tests and assets are plotted
    """
    ds = DatasetService()
    ms = ModelService()
    ts = TradingService()
    logs = []
    signal_columns = ("is_sell", "is_hold", "is_buy")
    for pipeline in PIPELINES:
        for symbol in SYMBOLS:
            for window in WINDOWS:
                print(
                    f"PIPELINE: {pipeline} SYMBOL: {symbol} WINDOW: {window}")
                ohlcv_ds = ds.get_dataset('ohlcv', symbol=symbol)
                test = ms.get_test(pipeline=pipeline,
                                   dataset=dataset,
                                   target='class',
                                   symbol=symbol,
                                   window=window)
                if not test:
                    print(
                        f"Test {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
                    )
                    logs.append(
                        f"MISSING_TEST {pipeline} {dataset} {symbol} class {window} --features importances_shap --parameters gridsearch\n"
                    )
                    continue
                asset = ts.get_asset(pipeline=pipeline,
                                     dataset=dataset,
                                     target='class',
                                     symbol=symbol,
                                     window=window,
                                     create=False)
                if not asset:
                    print(
                        f"Asset {pipeline}.{dataset}.class for {symbol} on window {window} not found!"
                    )
                    logs.append(
                        f"MISSING_ASSET {pipeline} {dataset} {symbol} {window}\n"
                    )
                    continue

                equity = TradingService.parse_equity_df(asset=asset)
                buy_and_hold = TradingService.parse_baseline_df(
                    asset=asset, name='buy_and_hold')
                orders = TradingService.parse_orders_df(asset=asset)

                # Map order position_id to numbers so we don't get a mess in the graph.
                # The result is assigned back to the frame: calling
                # replace(inplace=True) on a column pulled out of the frame
                # does not update the frame under pandas copy-on-write.
                position_ids = orders['position_id']
                for i, uid in enumerate(set(position_ids.values)):
                    position_ids = position_ids.replace(to_replace=uid,
                                                        value=i)
                orders['position_id'] = position_ids

                ohlcv = ds.get_dataset_features(ohlcv_ds,
                                                begin=test.test_interval.begin,
                                                end=test.test_interval.end)
                test_results = ModelService.parse_test_results(test).iloc[:-1]
                # Signals are drawn 10% below the low price of the day
                signals_level_diff = ohlcv.low * 10 / 100
                signals_level = ohlcv.low - signals_level_diff
                enc_pred = onehot_target(
                    test_results.predicted,
                    labels=["is_sell", "is_hold", "is_buy"],
                    fill=False)
                # In case of classifier bias (due to input bias) some classes are ignored.
                # In such cases, enc_pred won't contain the ignored classes.
                # Add them back by nan-filling (never selected)
                for column in signal_columns:
                    if column in enc_pred.columns:
                        # Where the class was predicted, replace the flag
                        # with the signal level. Assign the masked column
                        # back instead of masking in place, so the frame is
                        # really updated (see the position_id note above).
                        enc_pred[column] = enc_pred[column].mask(
                            enc_pred[column] > 0,
                            other=signals_level.loc[enc_pred.index])
                    else:
                        enc_pred[column] = pd.Series(np.nan,
                                                     index=enc_pred.index)

                # Get unique years in index to split plots in smaller scale
                unique_years = ohlcv.index.year.unique()
                for year in unique_years:
                    year_ohlcv = ohlcv[ohlcv.index.year == year]
                    year_pred = enc_pred[enc_pred.index.year == year]
                    year_equity = equity[equity.index.year == year]
                    year_buy_and_hodl = buy_and_hold[buy_and_hold.index.year ==
                                                     year]
                    year_orders = orders[orders.index.year == year]

                    unique_quarters = year_ohlcv.index.quarter.unique()
                    for quarter in unique_quarters:
                        q_ohlcv = year_ohlcv[year_ohlcv.index.quarter ==
                                             quarter]
                        q_pred = year_pred[year_pred.index.quarter == quarter]
                        q_equity = year_equity[year_equity.index.quarter ==
                                               quarter]
                        q_orders = year_orders[year_orders.index.quarter ==
                                               quarter]
                        q_buy_and_hodl = year_buy_and_hodl[
                            year_buy_and_hodl.index.quarter == quarter]
                        img_path = f"images/backtests-final/{pipeline}-{dataset}-class-W{window}/{symbol}/"
                        img_name = f"trades-{year}-Q{quarter}.png"
                        # Do not re-render quarters that were already plotted
                        if os.path.exists(f"{img_path}/{img_name}"):
                            print(f"[SKIP] File exists {img_path}/{img_name}")
                            continue
                        make_plot(
                            ohlcv=q_ohlcv,
                            orders=q_orders,
                            equity=q_equity,
                            baselines=[('Buy and Hold', q_buy_and_hodl)],
                            pred=q_pred,
                            signals_title=
                            f"{ohlcv_ds.symbol}, {pipeline}, W={window}D, {year} - Q{quarter}, 1D",
                            img_path=img_path,
                            img_name=img_name,
                            bollinger=True)
                        print(
                            f"{year}-Q{quarter} saved to {img_path}{img_name}")
    with open(f"trading_plotly.{dataset}.log", "w") as f:
        f.writelines(logs)
    print("Logs saved")