示例#1
0
def test_cust_grid_outliers_endpoint(titanic_data, titanic_target):
    """
    Custom grid points together with show_outliers=True and endpoint=False.

    Rows 0, 8 and 9 of the returned summary are compared against fixed
    values, and the summary is expected to hold 10 rows in total.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Fare',
        feature_name='Fare',
        target=titanic_target,
        cust_grid_points=range(0, 100, 10),
        show_outliers=True,
        endpoint=False,
    )
    _, _, summary = target_plot(**plot_kwargs)

    checked_rows = [0, 8, 9]
    expected_columns = {
        'x': {0: 0, 8: 8, 9: 9},
        'display_column': {0: '[0, 10)', 8: '[80, 90)', 9: '>= 90'},
        'value_lower': {0: 0.0, 8: 80.0, 9: 90.0},
        'value_upper': {0: 10.0, 8: 90.0, 9: np.nan},
        'count': {0: 336, 8: 15, 9: 61},
        'Survived': {
            0: 0.19940476190476192,
            8: 0.8666666666666667,
            9: 0.7540983606557377,
        },
    }
    expected = pd.DataFrame(expected_columns)
    observed = summary.loc[checked_rows, :]

    assert_frame_equal(expected, observed, check_like=True)
    assert len(summary) == 10
示例#2
0
def test_percentile_range_outliers_endpoint(titanic_data, titanic_target):
    """
    percentile_range=(5, 95) together with show_outliers=True and
    endpoint=False.

    Rows 0, 9 and 10 of the returned summary are compared against fixed
    values, and the summary is expected to hold 11 rows in total.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Fare',
        feature_name='Fare',
        target=titanic_target,
        percentile_range=(5, 95),
        show_outliers=True,
        endpoint=False,
    )
    _, _, summary = target_plot(**plot_kwargs)

    checked_rows = [0, 9, 10]
    expected_columns = {
        'x': {0: 0, 9: 9, 10: 10},
        'display_column': {0: '< 7.22', 9: '[56.5, 112.08)', 10: '>= 112.08'},
        'value_lower': {0: np.nan, 9: 56.4958, 10: 112.07915},
        'value_upper': {0: 7.225, 9: 112.07915, 10: np.nan},
        'count': {0: 43, 9: 91, 10: 45},
        'Survived': {
            0: 0.06976744186046512,
            9: 0.6593406593406593,
            10: 0.7555555555555555,
        },
    }
    expected = pd.DataFrame(expected_columns)
    observed = summary.loc[checked_rows, :]

    assert_frame_equal(expected, observed, check_like=True)
    assert len(summary) == 11
示例#3
0
def test_cust_grid_outliers(titanic_data, titanic_target):
    """
    Custom grid points together with show_outliers=True.

    Rows 0, 8 and 9 of the returned summary are compared against fixed
    values, and the summary is expected to hold 10 rows in total.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Fare',
        feature_name='Fare',
        target=titanic_target,
        cust_grid_points=range(0, 100, 10),
        show_outliers=True,
    )
    _, _, summary = target_plot(**plot_kwargs)

    checked_rows = [0, 8, 9]
    expected_columns = {
        'x': {0: 0, 8: 8, 9: 9},
        'display_column': {0: '[0, 10)', 8: '[80, 90]', 9: '> 90'},
        'value_lower': {0: 0.0, 8: 80.0, 9: 90.0},
        'value_upper': {0: 10.0, 8: 90.0, 9: np.nan},
        'count': {0: 336, 8: 19, 9: 57},
        'Survived': {
            0: 0.19940476190476192,
            8: 0.8421052631578947,
            9: 0.7543859649122807,
        },
    }
    expected = pd.DataFrame(expected_columns)
    observed = summary.loc[checked_rows, :]

    assert_frame_equal(expected, observed, check_like=True)
    assert len(summary) == 10
def targetDistribution(data, featureToExamine, resultColumnName):
    """Draw a pdpbox target plot for one feature and pass the figure to ``save``.

    The feature name is also used as the plot's display name. The axes and
    summary frame returned by ``info_plots.target_plot`` are discarded.
    Nothing is returned.
    """
    plot_args = dict(
        df=data,
        feature=featureToExamine,
        feature_name=featureToExamine,
        target=resultColumnName,
    )
    figure, _, _ = info_plots.target_plot(**plot_args)
    save("targetDistribution", plt=plt, fig=figure)
示例#5
0
def test_grid_range_outliers_endpoint(titanic_data, titanic_target):
    """
    grid_range=(0, 100) with grid_type='equal', show_outliers=True and
    endpoint=False.

    Rows 0, 8 and 9 of the returned summary are compared against fixed
    values, and the summary is expected to hold 10 rows in total.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Fare',
        feature_name='Fare',
        target=titanic_target,
        grid_range=(0, 100),
        grid_type='equal',
        show_outliers=True,
        endpoint=False,
    )
    _, _, summary = target_plot(**plot_kwargs)

    checked_rows = [0, 8, 9]
    expected_columns = {
        'x': {0: 0, 8: 8, 9: 9},
        'display_column': {0: '[0, 11.11)', 8: '[88.89, 100)', 9: '>= 100'},
        'value_lower': {0: 0.0, 8: 88.88888888888889, 9: 100.0},
        'value_upper': {0: 11.11111111111111, 8: 100.0, 9: np.nan},
        'count': {0: 364, 8: 10, 9: 53},
        'Survived': {0: 0.2087912087912088, 8: 0.9, 9: 0.7358490566037735},
    }
    expected = pd.DataFrame(expected_columns)
    observed = summary.loc[checked_rows, :]

    assert_frame_equal(expected, observed, check_like=True)
    assert len(summary) == 10
示例#6
0
def test_onehot(titanic_data, titanic_target):
    """Smoke test: target_plot accepts a list of one-hot 'Embarked' columns.

    No assertion is made on the result; the call only has to return a
    three-element tuple without raising.
    """
    onehot_columns = ["Embarked_C", "Embarked_Q", "Embarked_S"]
    plot_kwargs = dict(
        df=titanic_data,
        feature=onehot_columns,
        feature_name="Embarked",
        target=titanic_target,
    )
    _fig, _axes, _summary = target_plot(**plot_kwargs)
示例#7
0
def target_plot(X,Y,features_to_plot,labels,grid_range=None):
    """Draw one pdpbox target plot per feature, using an equal-width grid.

    Args:
        X: feature frame; its index defines the rows of the combined frame.
        Y: target column(s), concatenated column-wise next to ``X``.
        features_to_plot: iterable of feature names, one plot each.
        labels: value forwarded as ``target`` to ``info_plots.target_plot``.
        grid_range: optional grid range. When given, it is forwarded
            together with ``show_outliers=True``.

    Returns:
        ``(figs, axs)``: two lists holding the figure and the axes object
        returned by pdpbox for each feature, in iteration order.
    """
    from pdpbox import info_plots

    # ``pd.concat(..., join_axes=[X.index])`` was removed in pandas 1.0.
    # Re-indexing the concatenated frame on X's index is the documented
    # replacement and keeps exactly the rows of X.
    df = pd.concat([X, Y], axis=1).reindex(X.index)

    figs = []
    axs = []
    for feature in features_to_plot:
        plot_kwargs = dict(feature=feature, feature_name=feature,
                           target=labels, grid_type='equal')
        if grid_range is not None:
            # The original passed the string 'True'; a real boolean is what
            # the flag is meant to carry.
            plot_kwargs.update(show_outliers=True, grid_range=grid_range)
        fig, ax, _ = info_plots.target_plot(df, **plot_kwargs)
        figs.append(fig)
        axs.append(ax)

    return figs, axs
示例#8
0
    def info_target_plot(self, feature, sample=10000, target=None, grid_type='percentile', **kargs):
        """Show a pdpbox target plot for ``feature`` on a sample of the data.

        ``self.sample(sample)`` supplies the frame, ``target`` falls back to
        ``self.target`` when falsy, and any extra keyword arguments are
        forwarded to ``info_plots.target_plot``. The returned summary is
        wrapped in ``ResultDF(..., 'count')`` and stored on
        ``self.info_target_data``. Nothing is returned.
        """
        sampled_df = self.sample(sample)
        chosen_target = target or self.target
        _, plot_axes, summary = info_plots.target_plot(
            df=sampled_df,
            feature=feature,
            feature_name=feature,
            target=chosen_target,
            grid_type=grid_type,
            **kargs
        )
        self.info_target_data = ResultDF(summary, 'count')

        # NOTE(review): tick labels are read from self.summary['info_target'],
        # not from the summary computed above -- confirm this is intended.
        bar_axis = plot_axes['bar_ax']
        tick_labels = self.summary['info_target'].display_column.values
        bar_axis.set_xticklabels(tick_labels)
        plt.show()
示例#9
0
def test_show_percentile(titanic_data, titanic_target):
    """With show_percentile=True the 'Fare' summary should hold 9 rows."""
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        show_percentile=True,
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 9
示例#10
0
def test_cust_grid_points(titanic_data, titanic_target):
    """With custom grid points 0, 10, ..., 90 the summary should hold 9 rows."""
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        cust_grid_points=range(0, 100, 10),
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 9
示例#11
0
def test_percentile_range(titanic_data, titanic_target):
    """With percentile_range=(5, 95) the summary should hold 9 rows."""
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        percentile_range=(5, 95),
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 9
示例#12
0
def test_num_grid_points(titanic_data, titanic_target):
    """With num_grid_points=20 the summary should hold 19 rows."""
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        num_grid_points=20,
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 19
示例#13
0
def test_grid_type(titanic_data, titanic_target):
    """With grid_type="equal" the summary should hold 9 rows."""
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        grid_type="equal",
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 9
示例#14
0
    def handle(self, *args, **kwargs):
        """Plot the target distribution of 'Age_1' for job 59 and print it.

        Steps, in order:
          1. Load the job with primary key 59, its stored model and its
             encoded training log.
          2. Build integer custom grid points from the range of the feature
             grid and draw ``info_plots.target_plot`` with them; the figure
             is written to 'ice_plot_train_1_3_CType.png'.
          3. Record, for every grid value, the position of the first
             training row holding that (still encoded) value.
          4. Decode the training frame and print one
             ``{'value', 'label', 'count'}`` dict per bucket.

        Raises:
            ValueError: from ``list.index`` when a grid value does not occur
                in the encoded feature column.
        """
        # get model
        TARGET_MODEL = 59
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        # The model is not used below; the load is kept so that a missing or
        # unreadable model file still fails at this point.
        model = joblib.load(job.predictive_model.model_path)[0]
        # load data
        training_df, test_df = get_encoded_logs(job)
        training_df['label'] = training_df['label'].astype(bool).astype(int)
        # ``axis`` has to be passed by keyword: pandas 2.0 made every argument
        # of ``DataFrame.drop`` except ``labels`` keyword-only.
        features = list(
            training_df.drop(['trace_id', 'label'], axis=1).columns.values)
        feature = 'Age_1'
        # NOTE(review): 'percentile' is passed as percentile_range while
        # grid_type is None -- the two look swapped; confirm against
        # _get_grids before changing.
        feature_grids, percentile_info = _get_grids(
            feature_values=training_df[feature].values,
            num_grid_points=10,
            grid_type=None,
            percentile_range='percentile',
            grid_range=None)
        # Computed once and reused for both the grid points and the row lookup.
        grid_values = range(int(feature_grids.min()),
                            int(feature_grids.max() - 1))
        custom_grids = list(grid_values)
        print(features)
        fig, axes, summary_df = info_plots.target_plot(
            df=training_df,
            feature=feature,
            feature_name='feature value',
            cust_grid_points=custom_grids,
            target='label',
            show_percentile=False)
        fig.savefig('ice_plot_train_1_3_CType.png')

        # First row position of each grid value, taken on the encoded column
        # so it can be used to read the decoded value afterwards.
        encoded_values = list(training_df[feature].values)
        indexs = [encoded_values.index(x) for x in grid_values]
        encoder = retrieve_proper_encoder(job)
        # Return value is ignored; presumably decodes training_df in place --
        # TODO confirm.
        encoder.decode(training_df, job.encoding)
        values = training_df[feature].values
        print(summary_df)
        if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
            # The last grid value is skipped, as in the original loop over
            # range(len(indexs) - 1).
            lst = [
                {
                    'value': values[row_position],
                    'label': summary_df['label'][x],
                    'count': summary_df['count'][x],
                }
                for x, row_position in enumerate(indexs[:-1])
            ]
        else:
            lst = [
                {
                    'value': summary_df['display_column'][x],
                    'label': summary_df['label'][x],
                    'count': summary_df['count'][x],
                }
                for x in range(summary_df.shape[0])
            ]
        print(lst)
示例#15
0
def plot_pdp_cat(df, features, target):
    """Draw one target plot per categorical feature given as column prefix.

    For each prefix in ``features`` every column of ``df`` whose name starts
    with that prefix is passed as one multi-column feature. The bar-axis
    tick labels are then set to those column names with ``prefix + '_'``
    removed, rotated 45 degrees and right-aligned.
    """
    for prefix in features:
        member_columns = [name for name in df.columns if name.startswith(prefix)]
        _, axes, _ = info_plots.target_plot(
            df=df,
            feature=member_columns,
            feature_name=prefix,
            target=target,
        )
        # The columns are scanned again after plotting, exactly as before.
        tick_labels = [
            name.replace(prefix + '_', '')
            for name in df.columns
            if name.startswith(prefix)
        ]
        axes['bar_ax'].set_xticklabels(
            tick_labels,
            rotation=45,
            fontsize=12,
            horizontalalignment='right',
        )
def targetDistributionNumericFeature(data,
                                     featureToExamine,
                                     resultColumnName,
                                     show_percentile=True):
    """Draw a pdpbox target plot for a feature and pass the figure to ``save``.

    ``show_percentile`` is forwarded unchanged to ``info_plots.target_plot``.
    The axes and summary frame returned by pdpbox are discarded. Nothing is
    returned.
    """
    plot_args = dict(
        df=data,
        feature=featureToExamine,
        feature_name=featureToExamine,
        target=resultColumnName,
        show_percentile=show_percentile,
    )
    figure, _, _ = info_plots.target_plot(**plot_args)
    save("targetDistributionNumericFeature", fig=figure, plt=plt)
示例#17
0
def plot_pdp_cont(df, features, target):
    """Draw one percentile-grid target plot (21 grid points) per feature.

    The results returned by ``info_plots.target_plot`` are discarded;
    nothing is returned.
    """
    shared_kwargs = dict(
        df=df,
        target=target,
        grid_type="percentile",
        show_percentile=True,
        num_grid_points=21,
    )
    for column in features:
        _fig, _axes, _summary = info_plots.target_plot(
            feature=column, feature_name=column, **shared_kwargs
        )
示例#18
0
def test_endpoint(titanic_data, titanic_target):
    """
    endpoint=False (the last point shouldn't be included): the summary is
    expected to hold 10 rows.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        endpoint=False,
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 10
示例#19
0
def test_grid_range(titanic_data, titanic_target):
    """
    grid_range=(5, 100) with grid_type="equal": the summary is expected to
    hold 9 rows.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        grid_type="equal",
        grid_range=(5, 100),
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 9
示例#20
0
def test_percentile_range_outliers(titanic_data, titanic_target):
    """
    percentile_range=(5, 95) with show_outliers=True: the summary is
    expected to hold 11 rows.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        percentile_range=(5, 95),
        show_outliers=True,
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 11
示例#21
0
def test_cust_grid_outliers(titanic_data, titanic_target):
    """
    Custom grid points with show_outliers=True: the summary is expected to
    hold 10 rows.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        cust_grid_points=range(0, 100, 10),
        show_outliers=True,
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 10
示例#22
0
def test_binary(titanic_data, titanic_target):
    """
    Binary feature 'Sex': the whole summary is compared against the
    expected two-row frame.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Sex',
        feature_name='Sex',
        target=titanic_target,
    )
    _, _, summary = target_plot(**plot_kwargs)

    expected_columns = {
        'x': {0: 0, 1: 1},
        'display_column': {0: 'Sex_0', 1: 'Sex_1'},
        'count': {0: 314, 1: 577},
        'Survived': {0: 0.7420382165605095, 1: 0.18890814558058924},
    }
    expected = pd.DataFrame(expected_columns)

    assert_frame_equal(expected, summary, check_like=True)
示例#23
0
def explain(ice_exp: Explanation, training_df, test_df, explanation_target,
            prefix_target):
    """Summarise the label distribution over the values of one feature.

    The 'trace_id' column is dropped (on a new frame, so the caller's frame
    is not modified here), integer custom grid points are derived from the
    range of the feature grid, and ``info_plots.target_plot`` is run with
    them. One dict per bucket is returned.

    Args:
        ice_exp: explanation object; only ``ice_exp.job`` is read.
        training_df: encoded training frame holding 'trace_id', 'label' and
            the ``explanation_target`` column.
        test_df: not used by this function.
        explanation_target: name of the feature column to analyse.
        prefix_target: not used by this function.

    Returns:
        list of ``{'value', 'label', 'count'}`` dicts. For non-boolean
        encodings ``value`` is the decoded feature value of the first row
        holding each grid value (the last grid value is skipped); for the
        boolean encoding it is the bucket's ``display_column`` text.

    Raises:
        ValueError: from ``list.index`` when a grid value does not occur in
            the encoded feature column.
    """
    job = ice_exp.job
    # ``axis`` has to be passed by keyword: pandas 2.0 made every argument
    # of ``DataFrame.drop`` except ``labels`` keyword-only.
    training_df = training_df.drop(['trace_id'], axis=1)
    is_boolean_encoding = (
        job.encoding.value_encoding == ValueEncodings.BOOLEAN.value)
    if is_boolean_encoding:
        # Labels become 1 / 2 instead of False / True for boolean encodings.
        training_df['label'] = training_df['label'].astype(bool).astype(
            int) + 1

    # NOTE(review): 'percentile' is passed as percentile_range while
    # grid_type is None -- the two look swapped; confirm against _get_grids
    # before changing.
    feature_grids, percentile_info = _get_grids(
        feature_values=training_df[explanation_target].values,
        num_grid_points=10,
        grid_type=None,
        percentile_range='percentile',
        grid_range=None)
    # Computed once and reused for both the grid points and the row lookup.
    grid_values = range(int(feature_grids.min()),
                        int(feature_grids.max() - 1))
    custom_grids = list(grid_values)

    fig, axes, summary_df = info_plots.target_plot(
        df=training_df,
        feature=explanation_target,
        feature_name='feature value',
        cust_grid_points=custom_grids,
        target='label',
        show_percentile=False)

    # First row position of each grid value, taken on the encoded column so
    # it can be used to read the decoded value afterwards.
    encoded_values = list(training_df[explanation_target].values)
    indexs = [encoded_values.index(x) for x in grid_values]
    encoder = retrieve_proper_encoder(job)
    # Return value is ignored; presumably decodes training_df in place --
    # TODO confirm.
    encoder.decode(training_df, job.encoding)
    values = training_df[explanation_target].values

    if not is_boolean_encoding:
        # The last grid value is skipped, as in the original loop over
        # range(len(indexs) - 1).
        return [
            {
                'value': values[row_position],
                'label': summary_df['label'][x],
                'count': int(summary_df['count'][x]),
            }
            for x, row_position in enumerate(indexs[:-1])
        ]
    return [
        {
            'value': summary_df['display_column'][x],
            'label': summary_df['label'][x],
            'count': int(summary_df['count'][x]),
        }
        for x in range(summary_df.shape[0])
    ]
示例#24
0
文件: PDP.py 项目: kje980714/XAI
    def plot(self, i):
        """Render the target plot of column ``self.cols[i]`` against 'MEDV'.

        The current matplotlib figure is saved as a 200-dpi PNG into an
        in-memory buffer, the pyplot state is cleared and closed, and the
        buffer is returned rewound to position 0.
        """
        frame = self.boston_data
        _fig, _, _ = info_plots.target_plot(
            df=frame,
            feature=self.cols[i],
            feature_name=self.cols[i],
            target='MEDV',
        )

        buffer = BytesIO()
        plt.savefig(buffer, dpi=200, format='png')
        # Same teardown sequence as before: clear figure, clear axes, close.
        for teardown in (plt.clf, plt.cla, plt.close):
            teardown()

        buffer.seek(0)
        return buffer
示例#25
0
def test_grid_range_outliers(titanic_data, titanic_target):
    """
    grid_range=(0, 100) with grid_type="equal" and show_outliers=True: the
    summary is expected to hold 10 rows.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        grid_range=(0, 100),
        grid_type="equal",
        show_outliers=True,
    )
    _, _, summary = target_plot(**plot_kwargs)

    row_count = len(summary)
    assert row_count == 10
示例#26
0
def test_onehot(titanic_data, titanic_target):
    """
    One-hot encoded 'Embarked' columns: the whole summary is compared
    against the expected three-row frame.
    """
    onehot_columns = ['Embarked_C', 'Embarked_Q', 'Embarked_S']
    plot_kwargs = dict(
        df=titanic_data,
        feature=onehot_columns,
        feature_name='Embarked',
        target=titanic_target,
    )
    _, _, summary = target_plot(**plot_kwargs)

    expected_columns = {
        'x': {0: 0, 1: 1, 2: 2},
        'display_column': {0: 'Embarked_C', 1: 'Embarked_Q', 2: 'Embarked_S'},
        'count': {0: 168, 1: 77, 2: 646},
        'Survived': {
            0: 0.5535714285714286,
            1: 0.38961038961038963,
            2: 0.33900928792569657,
        },
    }
    expected = pd.DataFrame(expected_columns)

    assert_frame_equal(expected, summary, check_like=True)
示例#27
0
def target_plot(df,
                feature,
                feature_name,
                target,
                num_grid_points=10,
                xticklabels=None,
                show_percentile=False):
    """Thin wrapper around ``info_plots.target_plot``.

    Forwards the arguments to pdpbox and, when ``xticklabels`` is not
    ``None``, replaces the tick labels of the ``"bar_ax"`` axes with them.

    Returns:
        ``(fig, summary_df)``: the axes dict returned by pdpbox is dropped.
    """
    forwarded = dict(
        df=df,
        feature=feature,
        feature_name=feature_name,
        target=target,
        num_grid_points=num_grid_points,
        show_percentile=show_percentile,
    )
    figure, plot_axes, summary = info_plots.target_plot(**forwarded)

    if xticklabels is None:
        return figure, summary

    plot_axes["bar_ax"].set_xticklabels(xticklabels)
    return figure, summary
示例#28
0
def test_numeric(titanic_data, titanic_target):
    """
    Numeric feature 'Fare' with default settings.

    Rows 0, 4 and 7 of the returned summary are compared against fixed
    values, and the summary is expected to hold 9 rows in total.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Fare',
        feature_name='Fare',
        target=titanic_target,
    )
    _, _, summary = target_plot(**plot_kwargs)

    checked_rows = [0, 4, 7]
    expected_columns = {
        'x': {0: 0, 4: 4, 7: 7},
        'display_column': {
            0: '[0, 7.73)',
            4: '[13, 16.7)',
            7: '[35.11, 73.5)',
        },
        'value_lower': {0: 0.0, 4: 13.0, 7: 35.111111111111086},
        'value_upper': {0: 7.732844444444444, 4: 16.7, 7: 73.5},
        'count': {0: 99, 4: 108, 7: 96},
        'Survived': {
            0: 0.1414141414141414,
            4: 0.37037037037037035,
            7: 0.5104166666666666,
        },
    }
    expected = pd.DataFrame(expected_columns)
    observed = summary.loc[checked_rows, :]

    assert_frame_equal(expected, observed, check_like=True)
    assert len(summary) == 9
示例#29
0
def test_num_grid_points(titanic_data, titanic_target):
    """
    num_grid_points=20 on 'Fare'.

    Rows 0, 9 and 18 of the returned summary are compared against fixed
    values, and the summary is expected to hold 19 rows in total.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Fare',
        feature_name='Fare',
        target=titanic_target,
        num_grid_points=20,
    )
    _, _, summary = target_plot(**plot_kwargs)

    checked_rows = [0, 9, 18]
    expected_columns = {
        'x': {0: 0, 9: 9, 18: 18},
        'display_column': {
            0: '[0, 7.22)',
            9: '[13, 15.5)',
            18: '[110.88, 512.33]',
        },
        'value_lower': {0: 0.0, 9: 13.0, 18: 110.8833},
        'value_upper': {0: 7.225, 9: 15.5, 18: 512.3292},
        'count': {0: 43, 9: 80, 18: 49},
        'Survived': {
            0: 0.06976744186046512,
            9: 0.3375,
            18: 0.7551020408163265,
        },
    }
    expected = pd.DataFrame(expected_columns)
    observed = summary.loc[checked_rows, :]

    assert_frame_equal(expected, observed, check_like=True)
    assert len(summary) == 19
示例#30
0
def test_endpoint(titanic_data, titanic_target):
    """
    endpoint=False (the last point shouldn't be included).

    Rows 0, 8 and 9 of the returned summary are compared against fixed
    values, and the summary is expected to hold 10 rows in total.
    """
    plot_kwargs = dict(
        df=titanic_data,
        feature='Fare',
        feature_name='Fare',
        target=titanic_target,
        endpoint=False,
    )
    _, _, summary = target_plot(**plot_kwargs)

    checked_rows = [0, 8, 9]
    expected_columns = {
        'x': {0: 0, 8: 8, 9: 9},
        'display_column': {
            0: '[0, 7.73)',
            8: '[73.5, 512.33)',
            9: '>= 512.33',
        },
        'value_lower': {0: 0.0, 8: 73.5, 9: 512.3292},
        'value_upper': {0: 7.732844444444444, 8: 512.3292, 9: np.nan},
        'count': {0: 99, 8: 99, 9: 3},
        'Survived': {0: 0.1414141414141414, 8: 0.7171717171717171, 9: 1.0},
    }
    expected = pd.DataFrame(expected_columns)
    observed = summary.loc[checked_rows, :]

    assert_frame_equal(expected, observed, check_like=True)
    assert len(summary) == 10