def create_model_test(self, *, model: Model, split=0.7, step=None, task_key=None, window=None, **kwargs):
    service = DatasetService()
    ds = service.get_dataset(model.dataset, model.symbol)
    splits = DatasetService.get_train_test_split_indices(ds, split)
    parameters = kwargs.get('parameters')
    features = kwargs.get('features')
    # 'latest' reuses the parameters from the model's most recent parameter search, if any
    if isinstance(parameters, str) and parameters == 'latest':
        if model.parameters:
            parameters = model.parameters[-1].parameters
        else:
            parameters = None
    # A string for `features` is interpreted as a feature-selection method name
    if isinstance(features, str):
        fs = DatasetService.get_feature_selection(ds=ds, method=features, target=model.target)
        if fs:
            features = fs.features
        else:
            features = None
    result = ModelTest(
        window=window or {'days': 30},
        step=step or ds.interval,
        parameters=parameters or {},
        features=features or [],
        test_interval=splits['test'],
        task_key=task_key or str(uuid4())
    )
    return result
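# --- Usage sketch (illustrative, not part of the service) ---
# Assumes this method lives on a ModelService-like object that also exposes
# get_model(); the pipeline/dataset/symbol values below are placeholders.
def _example_create_model_test(model_service):
    model = model_service.get_model(pipeline='debug_xgboost', dataset='merged_new',
                                    target='class', symbol='BTCUSD')
    # 'latest' reuses the most recent parameter search; a string for `features`
    # names the feature-selection method whose selected features should be used.
    return model_service.create_model_test(
        model=model,
        split=0.7,
        window={'days': 90},
        parameters='latest',
        features='importances_shap',
    )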
def main(dataset: str, target: str, symbol: str):
    ds_service = DatasetService()
    ds = ds_service.get_dataset(name=dataset, symbol=symbol)
    fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target='class')
    # hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml", importances=fs.feature_importances)
    # hdf = pd.DataFrame(hierarchy)
    # fig = px.treemap(hdf, path=['category', 'subgroup', 'name'], values='importance')
    # fig.show()
    #
    # fig = px.sunburst(hdf, path=['category', 'subgroup', 'name'], values='importance')
    # fig.show()
    shap_values, shap_expected_values = parse_shap_values(fs.shap_values)
    X = ds_service.get_dataset_features(ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
    y = ds_service.get_target(name='class', symbol=symbol, begin=fs.search_interval.begin, end=fs.search_interval.end)

    fig = plt.figure()
    plt.suptitle(f"Shap summary plot for {dataset}.{symbol} -> {target}")
    shap.summary_plot(shap_values, X, class_names=["SELL", "HOLD", "BUY"], show=False, max_display=352, use_log_scale=True)
    plt.tight_layout()
    fig.show()

    shap_dfs = []
    for cls, arr in enumerate(shap_values):
        class_df = pd.DataFrame(arr, columns=X.columns, index=X.index)
        class_df.columns = [f"{c}_class{cls}" for c in class_df.columns]
        shap_dfs.append(class_df)
    shap_df = pd.concat(shap_dfs, axis='columns')
    shap_df = shap_df.reindex(sorted(shap_df.columns), axis=1)
    print(shap_df.head())
def main():
    models = ModelService()
    datasets = DatasetService()
    query = {
        "dataset": "merged_new",
        "target": "class"
    }
    all_models = models.query_models(query=query)
    for m in all_models:
        ds = datasets.get_dataset(name=m.dataset, symbol=m.symbol)
        fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target=m.target)
        if not fs:
            logging.error(f"Dataset {m.dataset}.{m.symbol} -> {m.target} does not have feature selection")
            continue
        if not m.parameters:
            logging.error(f"Model {m.pipeline}({m.dataset}.{m.symbol}) -> {m.target} does not have parameters")
            continue
        for mp in m.parameters:
            count = 0
            for f in mp.features:
                if f not in fs.features:
                    logging.error(f"Model {m.pipeline}({m.dataset}.{m.symbol}) -> {m.target} parameter search done without fixing features!")
                else:
                    count += 1
            logging.info(f"Model {m.pipeline}({m.dataset}.{m.symbol}) -> {m.target} GRIDSEARCH {mp.parameter_search_method} done with {count} features")
def main(dataset: str, target: str, pipeline: str):
    shapes = []
    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline, dataset=dataset, target=target, symbol=symbol)
        for t in model.tests:
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)
            shaps = []
            print("Calculating shap values...")
            for est in tqdm(estimators):
                est_class = y_all.loc[est.day]
                shap_v, shap_exp = get_shap_values(estimator=est, X=X_all.loc[est.day], X_train=est.train_x, bytes=False)
                df = pd.DataFrame([shap_v], index=[pd.to_datetime(est.day)], columns=X_all.columns)
                df['label'] = est_class
                df['shap_expected'] = shap_exp
                shaps.append(df)
            print("Exporting dataframe..")
            cdf = pd.concat(shaps, axis='index')
            os.makedirs(f"data/shap_values/{dataset}/{target}/{pipeline}/", exist_ok=True)
            cdf.to_csv(f"data/shap_values/{dataset}/{target}/{pipeline}/shap_test_{symbol}_Wdays{t.window['days']}.csv", index_label='time')
            print("Exported.")
        # # Load day estimator
        # est = load_estimator()
        print(f"Plotted {symbol}")
def create_parameters_search(self, model: Model, split: float, **kwargs) -> ModelParameters:
    ds = self.dataset_service.get_dataset(model.dataset, model.symbol)
    splits = DatasetService.get_train_test_split_indices(ds, split)

    # Features can either be a list of features to use, or a string; when a string
    # is given, it names the feature-selection method whose selected features are used.
    features = kwargs.get('features')
    # if isinstance(features, str) and features == 'latest':
    #     if model.features:
    #         features = model.features[-1].features
    #     else:
    #         features = None
    if features:
        target = kwargs.get('target', 'class')
        mf = DatasetService.get_feature_selection(ds=ds, method=kwargs.get('features'), target=target)
        if not mf:
            raise MessageException(f"Feature selection not found for {model.dataset}.{model.symbol} -> {target}!")
        features = mf.features

    # Determine K for K-fold cross validation based on the dataset's sample count.
    # Each fold is split 80% train / 20% test, and the smallest training window that
    # still gives accurate results is 30 samples, so each fold needs at least
    # X = 30 / 0.8 = 37.5 ~ 40 samples.
    X = 40
    k = 5
    # If samples per fold with 5-fold CV are too low, fall back to 3 folds
    if ds.count / k < X:
        k = 3
    # If samples are still too low, raise a value error
    if ds.count / k < X and not kwargs.get("permissive"):
        raise ValueError("Not enough samples to perform cross validation!")

    result = ModelParameters(
        cv_interval=splits['train'],
        cv_splits=k,
        task_key=kwargs.get('task_key', str(uuid4())),
        features=features or None
    )
    return result
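# Worked example of the fold-size rule above (illustrative helper, not part of the
# service): each fold needs ~40 samples (30 training samples at an 80/20 split),
# so 5-fold CV requires at least 200 samples and the 3-fold fallback at least 120.
def _example_choose_cv_splits(sample_count: int, min_per_fold: int = 40) -> int:
    k = 5
    if sample_count / k < min_per_fold:
        k = 3
    if sample_count / k < min_per_fold:
        raise ValueError("Not enough samples to perform cross validation!")
    return k

# _example_choose_cv_splits(250) -> 5
# _example_choose_cv_splits(150) -> 3
# _example_choose_cv_splits(100) -> ValueError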
def main(dataset: str, target: str, symbol: str):
    ds_service = DatasetService()
    ds = ds_service.get_dataset(name=dataset, symbol=symbol)
    fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target='class')
    # shap_values, shap_expected_values = parse_shap_values(fs.shap_values)
    # X = ds_service.get_dataset_features(ds=ds, begin=fs.search_interval.begin, end=fs.search_interval.end)
    # shap_df_0 = pd.DataFrame(data=shap_values[0], index=X.index, columns=X.columns)
    # shap_df_1 = pd.DataFrame(data=shap_values[1], index=X.index, columns=X.columns)
    # shap_df_2 = pd.DataFrame(data=shap_values[2], index=X.index, columns=X.columns)

    hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml", importances=fs.feature_importances)
    # for record in hierarchy:
    #     feature = record['name']
    #     try:
    #         record['shap_mean_0'] = shap_df_0[feature].mean()
    #         record['shap_mean_1'] = shap_df_1[feature].mean()
    #         record['shap_mean_2'] = shap_df_2[feature].mean()
    #     except KeyError as e:
    #         print(f"Feature {feature} not in dataset!")
    #         record['shap_mean_0'] = np.nan
    #         record['shap_mean_1'] = np.nan
    #         record['shap_mean_2'] = np.nan
    #         pass

    os.makedirs(f"data/selection_{dataset}_{target}/", exist_ok=True)
    hdf = pd.DataFrame(hierarchy)

    csv_name = f"data/selection_{dataset}_{target}/{symbol}_feature_importances.csv"
    hdf.to_csv(csv_name, index_label='index')
    print(f"Augmented importances dataframe exported to {csv_name}")

    csv_name = f"data/selection_{dataset}_{target}/{symbol}_feature_importances_selected.csv"
    hdf[hdf.name.isin(fs.features)].to_csv(csv_name, index_label='index')
    print(f"Augmented selected features dataframe exported to {csv_name}")
def main(dataset: str):
    dss = DatasetService()
    records = []
    for symbol in SYMBOLS:
        ds = dss.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds, 'importances_shap', 'class')
        target = dss.get_dataset_target(ds=ds, name='class')
        uniq, cnt = np.unique(target, return_counts=True)
        if cnt[0] + cnt[1] + cnt[2] != ds.count:
            print(f"Mismatch between classes and count in {symbol}")
        mindt = from_timestamp(ds.valid_index_min)
        maxdt = from_timestamp(ds.valid_index_max)
        daysn = (maxdt - mindt).days
        records.append({
            'Pair': symbol,
            'num_features': len(ds.features),
            'sel_features': len(fs.features),
            'min_index': ds.valid_index_min,
            'max_index': ds.valid_index_max,
            'valid_days': daysn,
            'records': ds.count,
            'sell_count': cnt[0],
            'hold_count': cnt[1],
            'buy_count': cnt[2]
        })
    df = pd.DataFrame.from_records(records)

    fig = px.timeline(df, x_start="min_index", x_end="max_index", y="Pair")
    fig.update_yaxes(autorange="reversed")  # otherwise tasks are listed from the bottom up
    # fig.show()
    fig.update_layout(
        title={
            'text': "Sample distribution across datasets",
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0.5)',
        font={'color': 'White'},
        margin={'l': 5, 'r': 5, 't': 80, 'b': 5, 'pad': 5}
    )
    fig.write_image("images/data_summary/timeline.png")

    for symbol in SYMBOLS:
        sdf = df[df.Pair == symbol]
        pie_values = [
            sdf['sell_count'].values[0],
            sdf['hold_count'].values[0],
            sdf['buy_count'].values[0]
        ]
        pie_labels = ['SELL', 'HOLD', 'BUY']
        sfig = go.Figure(data=[
            go.Pie(
                labels=pie_labels,
                values=pie_values,
                textinfo='label+percent',
                # insidetextorientation='radial',
                showlegend=False)
        ])
        sfig.update_layout(
            title={
                'text': f"Class distribution for pair {symbol}",
                'y': 0.9,
                'x': 0.5,
                'xanchor': 'center',
                'yanchor': 'top',
                'font': {'size': 22}
            },
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            font={'color': 'White', 'size': 26},
            margin={'l': 0, 'r': 0, 't': 80, 'b': 0, 'pad': 0},
            uniformtext_minsize=24
        )
        sfig.write_image(f"images/data_summary/{symbol}_distribution.png")
    print(df.head())
def main(dataset: str, target: str):
    num_shap_plots = 3
    shap_show_count = 10
    ds_service = DatasetService()
    m_service = ModelService()
    for pipeline in PIPELINES:
        for symbol in SYMBOLS:
            print(f"Plotting shap dataframes for pipeline {pipeline} symbol {symbol}")
            ds = ds_service.get_dataset(name=dataset, symbol=symbol)
            fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target=target)
            X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
            y_all = ds_service.get_dataset_target(ds=ds, name=target)
            model = m_service.get_model(pipeline=pipeline, dataset=dataset, target=target, symbol=symbol)
            for t in model.tests:
                placeholder = "{label}"
                csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
                expected_csv_name = csv_name.format(label='SHAP_expected')
                print(f"Loading results for test {t.window}")
                results = ModelService.parse_test_results(test=t)
                exp_shap_df = pd.read_csv(expected_csv_name, index_col='time', parse_dates=True)
                for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                    class_csv_name = csv_name.format(label=label)
                    cls_shap_df = pd.read_csv(class_csv_name, index_col='time', parse_dates=True)
                    cls_shap_df = cls_shap_df.loc[t.test_interval.begin:t.test_interval.end]
                    x_train = X_all.loc[cls_shap_df.index]
                    chunk_size = int(cls_shap_df.shape[0] / num_shap_plots)

                    fig = plt.figure(constrained_layout=True, figsize=(100, 50), dpi=300)
                    gs = GridSpec(3, num_shap_plots, figure=fig, wspace=1.5, hspace=0.3)
                    precision_ax = fig.add_subplot(gs[0, :])
                    shap_values_ax = fig.add_subplot(gs[1, :])
                    beeswarms_axs = [fig.add_subplot(gs[2, i]) for i in range(num_shap_plots)]
                    # format_axes(fig)

                    shap_plot_labels = set()
                    first_shap_day = results.iloc[0]['time'].replace('+00:00', '').replace('T', '').replace(':', '').replace('-', '')
                    middle_shap_day = results.iloc[int(results.shape[0] / 2)]['time'].replace('+00:00', '').replace('T', '').replace(':', '').replace('-', '')
                    last_shap_day = results.iloc[-1]['time'].replace('+00:00', '').replace('T', '').replace(':', '').replace('-', '')
                    for idx, dayname in enumerate([first_shap_day, middle_shap_day, last_shap_day]):
                        day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/daily/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_DAY{dayname}.csv"
                        # Plot each section's SHAP values
                        cdf_subset = pd.read_csv(day_csv_name, index_col='time', parse_dates=True)
                        train_subset = X_all.loc[cdf_subset.index]

                        # Get a rank of feature labels based on this section's shap values
                        abs_mean_shap = cdf_subset.abs().mean(axis='index')
                        abs_mean_rank = abs_mean_shap.sort_values(ascending=False)[:shap_show_count]
                        for l in abs_mean_rank.index:
                            # Save labels for features in the top-N
                            shap_plot_labels.add(l)

                        # Plot this section's SHAP values
                        plt.sca(beeswarms_axs[idx])
                        shap.summary_plot(cdf_subset.values, train_subset, max_display=shap_show_count, show=False, color_bar=False, sort=True)
                        min_date = cdf_subset.index.min().to_pydatetime()
                        max_date = cdf_subset.index.max().to_pydatetime() + timedelta(days=1)
                        min_date_f = min_date.strftime("%Y/%m/%d")
                        max_date_f = max_date.strftime("%Y/%m/%d")
                        beeswarms_axs[idx].set_xlabel(f"SHAP values\nWindow: {min_date_f} - {max_date_f}", fontsize=8)
                        beeswarms_axs[idx].tick_params(axis='y', which='major', labelsize=6)
                        beeswarms_axs[idx].tick_params(axis='x', which='major', labelsize=8)

                    # Plot shap values over the whole test window
                    day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{label}_Wdays{t.window['days']}_.csv"
                    plot_cls_shap_df = pd.read_csv(day_csv_name, index_col='time', parse_dates=True)

                    def get_spread(series):
                        return np.abs(series.max() - series.min())

                    plot_rank = plot_cls_shap_df[list(shap_plot_labels)].apply(get_spread, axis='index').sort_values(ascending=False)[:shap_show_count]
                    plot_cls_shap_df['xlabel'] = [d.to_pydatetime().strftime("%Y/%m/%d") for d in plot_cls_shap_df.index]
                    shap_ax = plot_cls_shap_df.plot(x='xlabel', y=[c for c in plot_rank.index], kind='line', ax=shap_values_ax, legend=False, xlabel='')
                    patches, labels = shap_ax.get_legend_handles_labels()
                    shap_ax.legend(patches, labels, loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 6})
                    shap_ax.tick_params(axis='x', which='major', labelsize=8)
                    shap_ax.set_ylabel('mean(|SHAP|)', fontsize=6)
                    # shap_ax.tick_params(labelbottom=False, labelleft=False)

                    # Get Metrics scores dataframe
                    cri_df = get_metrics_df(results).rolling(7, min_periods=1).mean()
                    cri_df['xlabel'] = [d.to_pydatetime().strftime("%Y/%m/%d") for d in cri_df.index]
                    cri_ax = cri_df.plot(x='xlabel', y=f"pre_{cls}", kind='line', ax=precision_ax, legend=False, xlabel='')
                    patches, labels = cri_ax.get_legend_handles_labels()
                    cri_ax.legend(patches, labels, loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 6})
                    cri_ax.set_ylabel('mean(precision)', fontsize=6)
                    cri_ax.tick_params(labelbottom=False, labelleft=True)

                    min_date = cri_df.index.min().to_pydatetime().strftime("%Y/%m/%d")
                    max_date = cri_df.index.max().to_pydatetime().strftime("%Y/%m/%d")
                    window = t.window['days']
                    fig.suptitle(f"{symbol}, {pipeline}, W={window}D, Class {label}, From {min_date} to {max_date}")
                    # fig.show()
                    os.makedirs("images/shap-test-final/", exist_ok=True)
                    plt.savefig(f"images/shap-test-final/{pipeline}_W{window}D_{dataset}_{target}_{symbol}_{label}.png", dpi='figure')
                    plt.close()
                    print(f"{label} OK")
                print(f"Exported symbol {symbol}.")
            # # Load day estimator
            # est = load_estimator()
            print(f"Plotted {symbol}")
def main(dataset: str, target: str, pipeline: str):
    shapes = []
    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Exporting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline, dataset=dataset, target=target, symbol=symbol)
        for t in model.tests:
            os.makedirs(f"data/shap_values/{dataset}/{target}/{pipeline}/daily", exist_ok=True)
            placeholder = "{label}"
            csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
            day_csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/daily/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_"
            print(f"Loading estimators for test {t.window}")
            estimators = ModelService.load_test_estimators(model=model, mt=t)
            results = ModelService.parse_test_results(test=t)
            shaps = [[], [], []]
            X_test = X_all.loc[t.test_interval.begin:t.test_interval.end]
            shap_expected = []
            print("Calculating shap values")
            shap_abs_mean = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
            for est in tqdm(estimators):
                est_class = y_all.loc[est.day]
                training_data = est.train_x.astype(np.float64).fillna(value=0)
                shap_v, shap_exp = get_shap_values(estimator=est.named_steps.c, X=training_data, X_train=training_data, bytes=False)
                if isinstance(shap_exp, float):
                    shap_expected.append([est.day] + [0, 0, shap_exp])
                else:
                    shap_expected.append([est.day] + [v for v in shap_exp])
                for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                    df = pd.DataFrame(shap_v[cls], index=est.train_x.index, columns=est.train_x.columns)
                    # if not shaps[cls]:  # If list is empty, append whole df
                    #     shaps[cls].append(df)
                    # else:
                    #     shaps[cls].append(df.iloc[-1:])  # otherwise only append new row (sliding window)
                    # Save shap values dataframe for each day
                    dayname = est.day.replace('+00:00', '').replace('T', '').replace(':', '').replace('-', '')
                    day_class_csv_name = day_csv_name.format(label=label) + f"DAY{dayname}.csv"
                    df.to_csv(day_class_csv_name, index_label='time')
                    # Process data for next plot
                    df_abs_mean = df.abs().mean().to_dict()
                    df_abs_mean['time'] = est.day
                    shaps[cls].append(df_abs_mean)
                    # print(shap_abs_mean.head())
            # Merge shap values in a unique dataframe and save to csv for each class
            for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                class_csv_name = csv_name.format(label=label)
                print(f"Exporting dataframe for class {label} -> {class_csv_name}")
                # cdf = pd.concat(shaps[cls], axis='index')
                cdf = pd.DataFrame.from_records(shaps[cls])
                cdf.index = pd.to_datetime(cdf.time)
                cdf = cdf[cdf.columns.difference(['time'])]
                cdf.to_csv(class_csv_name, index_label='time')
            expected_csv_name = csv_name.format(label='SHAP_expected')
            print(f"Exporting expected values dataframe -> {expected_csv_name}")
            edf = pd.DataFrame(
                shap_expected,
                columns=["time", "shap_expected_sell", "shap_expected_hold", "shap_expected_buy"],
            )
            edf.to_csv(expected_csv_name, index_label='time')
            print(f"Exported symbol {symbol}.")
        # # Load day estimator
        # est = load_estimator()
        print(f"Plotted {symbol}")
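# Hedged sketch of the get_shap_values helper used above (the real implementation
# lives elsewhere in the repo). It assumes a fitted tree-based classifier and the
# older SHAP API that returns one array per class; the `bytes` flag presumably
# controls serialization of the output and is ignored here.
import shap

def _get_shap_values_sketch(estimator, X, X_train, bytes=False):
    # Background data is optional for tree models; passing the training window
    # mirrors how the helper is called above.
    explainer = shap.TreeExplainer(estimator, data=X_train)
    shap_values = explainer.shap_values(X)   # list of (n_samples, n_features) arrays, one per class
    expected = explainer.expected_value      # per-class base values (a single float for single-output models)
    return shap_values, expected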
def main(dataset: str, target: str):
    # hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml")
    # hdf = pd.DataFrame(hierarchy)
    shapes = []
    for symbol in SYMBOLS:
        ds_service = DatasetService()
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target=target)
        shap_v, shap_exp = parse_shap_values(fs.shap_values)
        X_train = ds_service.get_dataset_features(
            ds=ds,
            begin=fs.search_interval.begin,
            end=fs.search_interval.end
            # , columns=fs.features
        )
        shapes.append(X_train.shape[0])

        # One SHAP dataframe per class (0=SELL, 1=HOLD, 2=BUY)
        shap_0 = pd.DataFrame(shap_v[0], index=X_train.index, columns=X_train.columns)
        shap_1 = pd.DataFrame(shap_v[1], index=X_train.index, columns=X_train.columns)
        shap_2 = pd.DataFrame(shap_v[2], index=X_train.index, columns=X_train.columns)
        sel_train = X_train[fs.features]
        sel_shap_0 = shap_0[fs.features]
        sel_shap_1 = shap_1[fs.features]
        sel_shap_2 = shap_2[fs.features]

        # Signed and absolute summary plots for each class, at two display depths
        for show_count in (50, 25):  # alternatively: len(fs.features)
            for sel_shap, label in zip((sel_shap_0, sel_shap_1, sel_shap_2), ("SELL", "HOLD", "BUY")):
                shap.summary_plot(sel_shap.values, sel_train, max_display=show_count, show=False)
                plt.tight_layout()
                plt.title(f"SHAP Summary plot for {symbol}, top {show_count} features for class {label}")
                plt.savefig(f"images/shap-global/{dataset}_{target}__shap__summary_plot_{symbol}_{label}_top{show_count}.png")
                plt.close()

                shap.summary_plot(np.abs(sel_shap.values), sel_train, max_display=show_count, show=False)
                plt.tight_layout()
                plt.title(f"Absolute SHAP Summary plot for {symbol}, top {show_count} features for class {label}")
                plt.savefig(f"images/shap-global/{dataset}_{target}__shap__summary_plot_{symbol}_{label}_abs_top{show_count}.png")
                plt.close()
        print(f"Plotted {symbol}")
def main(dataset: str, target: str, pipeline: str):
    hierarchy = load_hierarchy(f"{dataset}_{target}_feature_hierarchy.yml")
    hdf = pd.DataFrame(hierarchy)
    num_shap_plots = 3
    shap_show_count = 10
    ds_service = DatasetService()
    m_service = ModelService()
    for symbol in SYMBOLS:
        print(f"Plotting shap dataframes for symbol {symbol}")
        ds = ds_service.get_dataset(name=dataset, symbol=symbol)
        fs = DatasetService.get_feature_selection(ds=ds, method='importances_shap', target=target)
        X_all = ds_service.get_dataset_features(ds=ds, columns=fs.features)
        y_all = ds_service.get_dataset_target(ds=ds, name=target)
        model = m_service.get_model(pipeline=pipeline, dataset=dataset, target=target, symbol=symbol)
        for t in model.tests:
            os.makedirs(f"images/shap-test-hierarchy/{dataset}/{target}/{pipeline}/", exist_ok=True)
            placeholder = "{label}"
            csv_name = f"data/shap_values/{dataset}/{target}/{pipeline}/shap_training_window_{symbol}_{placeholder}_Wdays{t.window['days']}_.csv"
            expected_csv_name = csv_name.format(label='SHAP_expected')
            print(f"Loading results for test {t.window}")
            results = ModelService.parse_test_results(test=t)
            exp_shap_df = pd.read_csv(expected_csv_name, index_col='time', parse_dates=True)
            for cls, label in enumerate(["SELL", "HOLD", "BUY"]):
                class_csv_name = csv_name.format(label=label)
                cls_shap_df = pd.read_csv(class_csv_name, index_col='time', parse_dates=True)
                cls_shap_df = cls_shap_df.loc[t.test_interval.begin:t.test_interval.end]
                x_train = X_all.loc[cls_shap_df.index]
                chunk_size = int(cls_shap_df.shape[0] / num_shap_plots)
                # The per-window plotting below is currently disabled.
                # fig = plt.figure(constrained_layout=True, figsize=(100, 50), dpi=300)
                # gs = GridSpec(3, num_shap_plots, figure=fig, wspace=1.5, hspace=0.3)
                # precision_ax = fig.add_subplot(gs[0, :])
                # shap_values_ax = fig.add_subplot(gs[1, :])
                # beeswarms_axs = [fig.add_subplot(gs[2, i]) for i in range(num_shap_plots)]
                # #format_axes(fig)
                # shap_plot_labels = set()
                # for idx, start in enumerate(range(0, cls_shap_df.shape[0], chunk_size)):
                #     end = start + chunk_size
                #     left = cls_shap_df.shape[0] - end
                #     if left > 0 and left < chunk_size:
                #         end += left
                #     elif left < 0:
                #         break
                #     # Plot each section's SHAP values
                #     cdf_subset = cls_shap_df.iloc[start:end]
                #     train_subset = x_train.iloc[start:end]
                #
                #     # Get a rank of feature labels based on this section's shap values
                #     abs_mean_shap = cdf_subset.abs().mean(axis='index')
                #     abs_mean_rank = abs_mean_shap.sort_values(ascending=False)[:shap_show_count]
                #     for l in abs_mean_rank.index:
                #         # Save labels for features in the top-N
                #         shap_plot_labels.add(l)
                #
                #     # Plot this section's SHAP values
                #     plt.sca(beeswarms_axs[idx])
                #     shap.summary_plot(
                #         cdf_subset.values,
                #         train_subset,
                #         max_display=shap_show_count,
                #         show=False,
                #         color_bar=False,
                #         sort=True
                #     )
                #     min_date = cdf_subset.index.min().to_pydatetime().strftime("%Y/%m/%d")
                #     max_date = cdf_subset.index.max().to_pydatetime().strftime("%Y/%m/%d")
                #     beeswarms_axs[idx].set_xlabel(f"SHAP values\n{min_date} - {max_date}", fontsize=8)
                #     beeswarms_axs[idx].tick_params(axis='y', which='major', labelsize=6)
                #     beeswarms_axs[idx].tick_params(axis='x', which='major', labelsize=8)
                # # Plot shap values
                # plot_cls_shap_df = cls_shap_df.abs().rolling(7, min_periods=1).mean()
                # def get_spread(series):
                #     return np.abs(series.max() - series.min())
                # plot_rank = plot_cls_shap_df[list(shap_plot_labels)].apply(get_spread, axis='index').sort_values(ascending=False)[:shap_show_count]
                # plot_cls_shap_df['xlabel'] = [t.to_pydatetime().strftime("%Y/%m/%d") for t in plot_cls_shap_df.index]
                # shap_ax = plot_cls_shap_df.plot(
                #     x='xlabel',
                #     y=[c for c in plot_rank.index],
                #     kind='line',
                #     ax=shap_values_ax,
                #     legend=False,
                #     xlabel=''
                # )
                # patches, labels = shap_ax.get_legend_handles_labels()
                # shap_ax.legend(patches, labels, loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 6})
                # shap_ax.tick_params(axis='x', which='major', labelsize=8)
                # shap_ax.set_ylabel('mean(|SHAP|)', fontsize=6)
                # #shap_ax.tick_params(labelbottom=False, labelleft=False)
                #
                # # Get Metrics scores dataframe
                # cri_df = get_metrics_df(results).rolling(7, min_periods=1).mean()
                # cri_df['xlabel'] = [t.to_pydatetime().strftime("%Y/%m/%d") for t in cri_df.index]
                # cri_ax = cri_df.plot(
                #     x='xlabel',
                #     y=f"pre_{cls}",
                #     kind='line',
                #     ax=precision_ax,
                #     legend=False,
                #     xlabel=''
                # )
                # patches, labels = cri_ax.get_legend_handles_labels()
                # cri_ax.legend(patches, labels, loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 6})
                # cri_ax.set_ylabel('mean(precision)', fontsize=6)
                # cri_ax.tick_params(labelbottom=False, labelleft=True)
                #
                # min_date = cri_df.index.min().to_pydatetime().strftime("%Y/%m/%d")
                # max_date = cri_df.index.max().to_pydatetime().strftime("%Y/%m/%d")
                # fig.suptitle(f"{pipeline}, {symbol}, class {label} tests from {min_date} to {max_date}")
                #
                # # fig.show()
                # plt.savefig(
                #     f"images/shap-test/{pipeline}_{dataset}_{target}_{symbol}_{label}.png",
                #     dpi='figure'
                # )
                # plt.close()
                print(f"{label} OK")
            print(f"Exported symbol {symbol}.")
        # # Load day estimator
        # est = load_estimator()
        print(f"Plotted {symbol}")