def test_cust_grid_outliers_endpoint(titanic_data, titanic_target):
    """Custom grid points with ``show_outliers=True`` and ``endpoint=False``.

    Compares rows 0, 8 and 9 of the ``Fare`` summary against known values
    and checks that the summary has 10 rows in total.
    """
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        cust_grid_points=range(0, 100, 10),
        show_outliers=True,
        endpoint=False,
    )

    rows = [0, 8, 9]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 8, 9])),
            "display_column": dict(zip(rows, ["[0, 10)", "[80, 90)", ">= 90"])),
            "value_lower": dict(zip(rows, [0.0, 80.0, 90.0])),
            "value_upper": dict(zip(rows, [10.0, 90.0, np.nan])),
            "count": dict(zip(rows, [336, 15, 61])),
            "Survived": dict(
                zip(rows, [0.19940476190476192, 0.8666666666666667, 0.7540983606557377])
            ),
        }
    )

    assert_frame_equal(expected, summary.loc[rows, :], check_like=True)
    assert len(summary) == 10
def test_percentile_range_outliers_endpoint(titanic_data, titanic_target):
    """``percentile_range`` with ``show_outliers=True`` and ``endpoint=False``.

    Compares rows 0, 9 and 10 of the ``Fare`` summary against known values
    and checks that the summary has 11 rows in total.
    """
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        percentile_range=(5, 95),
        show_outliers=True,
        endpoint=False,
    )

    rows = [0, 9, 10]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 9, 10])),
            "display_column": dict(
                zip(rows, ["< 7.22", "[56.5, 112.08)", ">= 112.08"])
            ),
            "value_lower": dict(zip(rows, [np.nan, 56.4958, 112.07915])),
            "value_upper": dict(zip(rows, [7.225, 112.07915, np.nan])),
            "count": dict(zip(rows, [43, 91, 45])),
            "Survived": dict(
                zip(rows, [0.06976744186046512, 0.6593406593406593, 0.7555555555555555])
            ),
        }
    )

    assert_frame_equal(expected, summary.loc[rows, :], check_like=True)
    assert len(summary) == 11
def test_cust_grid_outliers(titanic_data, titanic_target):
    """Custom grid points with ``show_outliers=True`` (default endpoint).

    Compares rows 0, 8 and 9 of the ``Fare`` summary against known values
    and checks that the summary has 10 rows in total.
    """
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        cust_grid_points=range(0, 100, 10),
        show_outliers=True,
    )

    rows = [0, 8, 9]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 8, 9])),
            "display_column": dict(zip(rows, ["[0, 10)", "[80, 90]", "> 90"])),
            "value_lower": dict(zip(rows, [0.0, 80.0, 90.0])),
            "value_upper": dict(zip(rows, [10.0, 90.0, np.nan])),
            "count": dict(zip(rows, [336, 19, 57])),
            "Survived": dict(
                zip(rows, [0.19940476190476192, 0.8421052631578947, 0.7543859649122807])
            ),
        }
    )

    assert_frame_equal(expected, summary.loc[rows, :], check_like=True)
    assert len(summary) == 10
def targetDistribution(data, featureToExamine, resultColumnName):
    """Draw the target plot for one feature and hand the figure to ``save``.

    Args:
        data: Frame passed to ``info_plots.target_plot`` as ``df``.
        featureToExamine: Column used both as the feature and as its
            display name.
        resultColumnName: Column passed as the ``target``.
    """
    plot_kwargs = {
        "df": data,
        "feature": featureToExamine,
        "feature_name": featureToExamine,
        "target": resultColumnName,
    }
    # Only the figure is needed; the axes and summary frame are discarded.
    fig, _axes, _summary = info_plots.target_plot(**plot_kwargs)
    save("targetDistribution", fig=fig, plt=plt)
def test_grid_range_outliers_endpoint(titanic_data, titanic_target):
    """``grid_range`` with ``show_outliers=True`` and ``endpoint=False``.

    ``grid_range`` is used together with ``grid_type='equal'``. Compares
    rows 0, 8 and 9 of the ``Fare`` summary against known values and checks
    that the summary has 10 rows in total.
    """
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        grid_range=(0, 100),
        grid_type="equal",
        show_outliers=True,
        endpoint=False,
    )

    rows = [0, 8, 9]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 8, 9])),
            "display_column": dict(
                zip(rows, ["[0, 11.11)", "[88.89, 100)", ">= 100"])
            ),
            "value_lower": dict(zip(rows, [0.0, 88.88888888888889, 100.0])),
            "value_upper": dict(zip(rows, [11.11111111111111, 100.0, np.nan])),
            "count": dict(zip(rows, [364, 10, 53])),
            "Survived": dict(zip(rows, [0.2087912087912088, 0.9, 0.7358490566037735])),
        }
    )

    assert_frame_equal(expected, summary.loc[rows, :], check_like=True)
    assert len(summary) == 10
def test_onehot(titanic_data, titanic_target):
    """One-hot encoded feature: the summary has one row per encoded column.

    The three ``Embarked_*`` columns are passed as a single one-hot feature,
    so the summary is expected to contain exactly three rows.
    """
    fig, axes, summary_df = target_plot(
        df=titanic_data,
        feature=["Embarked_C", "Embarked_Q", "Embarked_S"],
        feature_name="Embarked",
        target=titanic_target,
    )
    # Without an assertion this test only proved the call does not raise.
    assert len(summary_df) == 3
def target_plot(X,Y,features_to_plot,labels,grid_range=None):
    """Draw one pdpbox target plot per feature and collect the results.

    Args:
        X: Feature frame; its index defines the rows of the plotted frame.
        Y: Target data, concatenated column-wise onto ``X``.
        features_to_plot: Iterable of column names to plot.
        labels: Passed to ``info_plots.target_plot`` as ``target``.
        grid_range: Optional grid range. When given, it is forwarded
            together with ``show_outliers=True``.

    Returns:
        Tuple ``(figs, axs)`` of lists holding the figure and axes returned
        for each feature, in the order of ``features_to_plot``.
    """
    from pdpbox import info_plots

    # ``join_axes`` was removed from ``pd.concat`` in pandas 1.0; reindexing
    # the concatenated frame on X's index is the documented replacement.
    df = pd.concat([X, Y], axis=1).reindex(X.index)

    figs = []
    axs = []
    for feature in features_to_plot:
        plot_kwargs = {
            'feature': feature,
            'feature_name': feature,
            'target': labels,
            'grid_type': 'equal',
        }
        if grid_range is not None:
            # A real boolean instead of the (merely truthy) string 'True'.
            plot_kwargs['show_outliers'] = True
            plot_kwargs['grid_range'] = grid_range
        fig, ax, _ = info_plots.target_plot(df, **plot_kwargs)
        figs.append(fig)
        axs.append(ax)
    return figs, axs
def info_target_plot(self, feature, sample = 10000, target = None, grid_type = 'percentile', **kargs):
    """Show a pdpbox target plot for ``feature`` on a sample of the data.

    Args:
        feature: Column used both as the feature and as its display name.
        sample: Argument forwarded to ``self.sample`` to obtain the frame.
        target: Target column; falls back to ``self.target`` when falsy.
        grid_type: Forwarded to ``info_plots.target_plot``.
        **kargs: Extra keyword arguments forwarded unchanged.
    """
    sampled_df = self.sample(sample)
    chosen_target = target or self.target
    _fig, axes, result = info_plots.target_plot(
        df=sampled_df,
        feature=feature,
        feature_name=feature,
        target=chosen_target,
        grid_type=grid_type,
        **kargs
    )
    self.info_target_data = ResultDF(result, 'count')

    # NOTE(review): the labels are read from self.summary['info_target'];
    # presumably assigning info_target_data above populates it - confirm.
    bar_axis = axes['bar_ax']
    bar_axis.set_xticklabels(self.summary['info_target'].display_column.values)
    plt.show()
def test_show_percentile(titanic_data, titanic_target):
    """``show_percentile=True`` on ``Fare`` yields a nine-row summary."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "show_percentile": True,
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 9
def test_cust_grid_points(titanic_data, titanic_target):
    """Ten custom grid points on ``Fare`` yield a nine-row summary."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "cust_grid_points": range(0, 100, 10),
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 9
def test_percentile_range(titanic_data, titanic_target):
    """``percentile_range=(5, 95)`` on ``Fare`` yields a nine-row summary."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "percentile_range": (5, 95),
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 9
def test_num_grid_points(titanic_data, titanic_target):
    """``num_grid_points=20`` on ``Fare`` yields a 19-row summary."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "num_grid_points": 20,
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 19
def test_grid_type(titanic_data, titanic_target):
    """``grid_type='equal'`` on ``Fare`` yields a nine-row summary."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "grid_type": "equal",
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 9
def handle(self, *args, **kwargs):
    """Build, save and print a target-plot summary for a hard-coded job.

    Loads job 59, computes integer grid points for the ``Age_1`` feature of
    its training log, draws a pdpbox target plot against ``label``, saves the
    figure to ``ice_plot_train_1_3_CType.png`` and prints the per-bucket
    value / label / count records.

    NOTE(review): presumably a Django management command entry point;
    ``args`` and ``kwargs`` are not used.
    """
    # get model
    TARGET_MODEL = 59
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    # The model is not used below; the load is kept so that a missing or
    # unreadable model file still fails at this point.
    model = joblib.load(job.predictive_model.model_path)[0]

    # load data
    training_df, test_df = get_encoded_logs(job)
    training_df['label'] = training_df['label'].astype(bool).astype(int)
    # Keyword form: the positional ``axis`` argument of DataFrame.drop was
    # removed in pandas 2.0.
    features = list(
        training_df.drop(columns=['trace_id', 'label']).columns.values)

    feature = 'Age_1'
    # NOTE(review): ``grid_type=None`` with ``percentile_range='percentile'``
    # looks like swapped arguments - confirm against _get_grids.
    feature_grids, percentile_info = _get_grids(
        feature_values=training_df[feature].values,
        num_grid_points=10,
        grid_type=None,
        percentile_range='percentile',
        grid_range=None)

    # One integer grid point per value in [min, max - 1); computed once and
    # reused for both the plot grid and the row lookup below.
    grid_values = range(int(feature_grids.min()), int(feature_grids.max() - 1))
    custom_grids = list(grid_values)

    print(features)
    fig, axes, summary_df = info_plots.target_plot(
        df=training_df,
        feature=feature,
        feature_name='feature value',
        cust_grid_points=custom_grids,
        target='label',
        show_percentile=False)
    fig.savefig('ice_plot_train_1_3_CType.png')

    # Position of the first row holding each grid value (while still
    # encoded). ``list.index`` raises ValueError if a value never occurs.
    lists = list(training_df[feature].values)
    indexs = [lists.index(x) for x in grid_values]

    # Decode after the lookup so ``values`` holds the decoded values.
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    values = training_df[feature].values

    print(summary_df)
    if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
        # presumably N grid points give N - 1 summary rows, hence the -1
        lst = [
            {
                'value': values[indexs[x]],
                'label': summary_df['label'][x],
                'count': summary_df['count'][x],
            }
            for x in range(len(indexs) - 1)
        ]
    else:
        lst = [
            {
                'value': summary_df['display_column'][x],
                'label': summary_df['label'][x],
                'count': summary_df['count'][x],
            }
            for x in range(summary_df.shape[0])
        ]
    print(lst)
def plot_pdp_cat(df, features, target):
    """Draw a target plot for each one-hot encoded categorical feature.

    For every name in ``features``, all columns of ``df`` whose name starts
    with that name are plotted together as one feature, and the bar-axis
    tick labels are set to the column names with the ``<name>_`` prefix
    removed.

    Args:
        df: Frame holding the encoded columns and the target.
        features: Iterable of feature name prefixes.
        target: Passed to ``info_plots.target_plot`` as ``target``.
    """
    for col in features:
        # Scan the columns once and reuse the result for plot and labels.
        sub_cols = [sub_col for sub_col in df.columns if sub_col.startswith(col)]
        fig, axes, summary_df = info_plots.target_plot(
            df=df, feature=sub_cols, feature_name=col, target=target)

        # Strip only the leading "<col>_" prefix; str.replace would also
        # remove later occurrences inside the category name.
        prefix = col + '_'
        tick_labels = [
            name[len(prefix):] if name.startswith(prefix) else name
            for name in sub_cols
        ]
        _ = axes['bar_ax'].set_xticklabels(
            tick_labels, rotation=45, fontsize=12, horizontalalignment='right')
def targetDistributionNumericFeature(data, featureToExamine, resultColumnName, show_percentile=True):
    """Draw the target plot for a numeric feature and hand it to ``save``.

    Args:
        data: Frame passed to ``info_plots.target_plot`` as ``df``.
        featureToExamine: Column used both as the feature and as its
            display name.
        resultColumnName: Column passed as the ``target``.
        show_percentile: Forwarded to ``info_plots.target_plot``.
    """
    plot_kwargs = {
        "df": data,
        "feature": featureToExamine,
        "feature_name": featureToExamine,
        "target": resultColumnName,
        "show_percentile": show_percentile,
    }
    # Only the figure is needed; the axes and summary frame are discarded.
    fig, _axes, _summary = info_plots.target_plot(**plot_kwargs)
    save("targetDistributionNumericFeature", plt=plt, fig=fig)
def plot_pdp_cont(df, features, target):
    """Draw a percentile-grid target plot for each continuous feature.

    Args:
        df: Frame holding the feature columns and the target.
        features: Iterable of column names to plot one by one.
        target: Passed to ``info_plots.target_plot`` as ``target``.
    """
    # Settings shared by every plot: 21 percentile grid points, shown.
    shared_options = {
        "grid_type": "percentile",
        "show_percentile": True,
        "num_grid_points": 21,
    }
    for column in features:
        info_plots.target_plot(
            df=df,
            feature=column,
            feature_name=column,
            target=target,
            **shared_options
        )
def test_endpoint(titanic_data, titanic_target):
    """``endpoint=False`` (last point shouldn't be included): 10 rows."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "endpoint": False,
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 10
def test_grid_range(titanic_data, titanic_target):
    """``grid_range=(5, 100)`` with ``grid_type='equal'``: nine rows."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "grid_type": "equal",
        "grid_range": (5, 100),
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 9
def test_percentile_range_outliers(titanic_data, titanic_target):
    """``percentile_range=(5, 95)`` with ``show_outliers=True``: 11 rows."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "percentile_range": (5, 95),
        "show_outliers": True,
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 11
def test_cust_grid_outliers(titanic_data, titanic_target):
    """Custom grid points with ``show_outliers=True``: 10 rows."""
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "cust_grid_points": range(0, 100, 10),
        "show_outliers": True,
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 10
def test_binary(titanic_data, titanic_target):
    """Binary feature ``Sex``: the whole summary matches the known values."""
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Sex",
        feature_name="Sex",
        target=titanic_target,
    )

    rows = [0, 1]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 1])),
            "display_column": dict(zip(rows, ["Sex_0", "Sex_1"])),
            "count": dict(zip(rows, [314, 577])),
            "Survived": dict(zip(rows, [0.7420382165605095, 0.18890814558058924])),
        }
    )

    assert_frame_equal(expected, summary, check_like=True)
def explain(ice_exp: Explanation, training_df, test_df, explanation_target, prefix_target):
    """Summarise how the label varies across values of one feature.

    Builds integer grid points over the range of ``explanation_target`` in
    the training log, runs a pdpbox target plot against ``label`` and
    returns one record per summary row.

    Args:
        ice_exp: Explanation whose ``job`` supplies the encoding.
        training_df: Encoded training log; must contain ``trace_id``,
            ``label`` and the ``explanation_target`` column.
        test_df: Not used by this function.
        explanation_target: Name of the feature column to explain.
        prefix_target: Not used by this function.

    Returns:
        List of dicts with keys ``value``, ``label`` and ``count``. For
        boolean value encoding ``value`` is the summary's display column;
        otherwise it is the decoded feature value for that grid point.

    Raises:
        ValueError: If a grid value does not occur in the feature column
            (raised by ``list.index``).
    """
    job = ice_exp.job
    # Keyword form: the positional ``axis`` argument of DataFrame.drop was
    # removed in pandas 2.0.
    training_df = training_df.drop(columns=['trace_id'])
    is_boolean_encoding = (
        job.encoding.value_encoding == ValueEncodings.BOOLEAN.value)
    if is_boolean_encoding:
        training_df['label'] = training_df['label'].astype(bool).astype(
            int) + 1

    # NOTE(review): ``grid_type=None`` with ``percentile_range='percentile'``
    # looks like swapped arguments - confirm against _get_grids.
    feature_grids, _ = _get_grids(
        feature_values=training_df[explanation_target].values,
        num_grid_points=10,
        grid_type=None,
        percentile_range='percentile',
        grid_range=None)

    # One integer grid point per value in [min, max - 1); computed once and
    # reused for both the plot grid and the row lookup below.
    grid_values = range(int(feature_grids.min()), int(feature_grids.max() - 1))
    custom_grids = list(grid_values)

    _, _, summary_df = info_plots.target_plot(
        df=training_df,
        feature=explanation_target,
        feature_name='feature value',
        cust_grid_points=custom_grids,
        target='label',
        show_percentile=False)

    # Position of the first row holding each grid value (while still encoded).
    lists = list(training_df[explanation_target].values)
    indexs = [lists.index(x) for x in grid_values]

    # Decode after the lookup so ``values`` holds the decoded values.
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    values = training_df[explanation_target].values

    if not is_boolean_encoding:
        # presumably N grid points give N - 1 summary rows, hence the -1
        return [
            {
                'value': values[indexs[x]],
                'label': summary_df['label'][x],
                'count': int(summary_df['count'][x]),
            }
            for x in range(len(indexs) - 1)
        ]
    return [
        {
            'value': summary_df['display_column'][x],
            'label': summary_df['label'][x],
            'count': int(summary_df['count'][x]),
        }
        for x in range(summary_df.shape[0])
    ]
def plot(self, i):
    """Render the target plot for column ``self.cols[i]`` as PNG bytes.

    Args:
        i: Index into ``self.cols`` selecting the feature to plot against
            the ``'MEDV'`` target of ``self.boston_data``.

    Returns:
        A ``BytesIO`` holding the PNG image (200 dpi), rewound to the start.
    """
    column = self.cols[i]
    fig, _, _ = info_plots.target_plot(df=self.boston_data,
                                       feature=column,
                                       feature_name=column,
                                       target='MEDV')
    img = BytesIO()
    try:
        # Save the returned figure explicitly instead of relying on
        # pyplot's implicit "current figure".
        fig.savefig(img, format='png', dpi=200)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)
    img.seek(0)
    return img
def test_grid_range_outliers(titanic_data, titanic_target):
    """``grid_range`` with ``show_outliers=True``: 10 rows.

    ``grid_range`` is used together with ``grid_type='equal'``.
    """
    plot_kwargs = {
        "df": titanic_data,
        "feature": "Fare",
        "feature_name": "Fare",
        "target": titanic_target,
        "grid_range": (0, 100),
        "grid_type": "equal",
        "show_outliers": True,
    }
    _, _, summary = target_plot(**plot_kwargs)
    assert len(summary) == 10
def test_onehot(titanic_data, titanic_target):
    """One-hot feature ``Embarked``: the whole summary matches known values."""
    _, _, summary = target_plot(
        df=titanic_data,
        feature=["Embarked_C", "Embarked_Q", "Embarked_S"],
        feature_name="Embarked",
        target=titanic_target,
    )

    rows = [0, 1, 2]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 1, 2])),
            "display_column": dict(
                zip(rows, ["Embarked_C", "Embarked_Q", "Embarked_S"])
            ),
            "count": dict(zip(rows, [168, 77, 646])),
            "Survived": dict(
                zip(rows, [0.5535714285714286, 0.38961038961038963, 0.33900928792569657])
            ),
        }
    )

    assert_frame_equal(expected, summary, check_like=True)
def target_plot(df, feature, feature_name, target, num_grid_points=10, xticklabels=None, show_percentile=False):
    """Call ``info_plots.target_plot`` and optionally relabel the bar axis.

    Args:
        df, feature, feature_name, target, num_grid_points, show_percentile:
            Forwarded unchanged to ``info_plots.target_plot``.
        xticklabels: When not ``None``, applied to the ``"bar_ax"`` axes as
            tick labels.

    Returns:
        Tuple ``(fig, summary_df)``; the axes are not returned.
    """
    plot_result = info_plots.target_plot(
        df=df,
        feature=feature,
        feature_name=feature_name,
        target=target,
        num_grid_points=num_grid_points,
        show_percentile=show_percentile,
    )
    fig, axes, summary_df = plot_result

    # Nothing to relabel: return early.
    if xticklabels is None:
        return fig, summary_df

    axes["bar_ax"].set_xticklabels(xticklabels)
    return fig, summary_df
def test_numeric(titanic_data, titanic_target):
    """Numeric feature ``Fare`` with default settings.

    Compares rows 0, 4 and 7 of the summary against known values and checks
    that the summary has nine rows in total.
    """
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
    )

    rows = [0, 4, 7]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 4, 7])),
            "display_column": dict(
                zip(rows, ["[0, 7.73)", "[13, 16.7)", "[35.11, 73.5)"])
            ),
            "value_lower": dict(zip(rows, [0.0, 13.0, 35.111111111111086])),
            "value_upper": dict(zip(rows, [7.732844444444444, 16.7, 73.5])),
            "count": dict(zip(rows, [99, 108, 96])),
            "Survived": dict(
                zip(rows, [0.1414141414141414, 0.37037037037037035, 0.5104166666666666])
            ),
        }
    )

    assert_frame_equal(expected, summary.loc[rows, :], check_like=True)
    assert len(summary) == 9
def test_num_grid_points(titanic_data, titanic_target):
    """``num_grid_points=20`` on ``Fare``.

    Compares rows 0, 9 and 18 of the summary against known values and
    checks that the summary has 19 rows in total.
    """
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        num_grid_points=20,
    )

    rows = [0, 9, 18]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 9, 18])),
            "display_column": dict(
                zip(rows, ["[0, 7.22)", "[13, 15.5)", "[110.88, 512.33]"])
            ),
            "value_lower": dict(zip(rows, [0.0, 13.0, 110.8833])),
            "value_upper": dict(zip(rows, [7.225, 15.5, 512.3292])),
            "count": dict(zip(rows, [43, 80, 49])),
            "Survived": dict(zip(rows, [0.06976744186046512, 0.3375, 0.7551020408163265])),
        }
    )

    assert_frame_equal(expected, summary.loc[rows, :], check_like=True)
    assert len(summary) == 19
def test_endpoint(titanic_data, titanic_target):
    """``endpoint=False`` (last point shouldn't be included).

    Compares rows 0, 8 and 9 of the ``Fare`` summary against known values
    and checks that the summary has 10 rows in total.
    """
    _, _, summary = target_plot(
        df=titanic_data,
        feature="Fare",
        feature_name="Fare",
        target=titanic_target,
        endpoint=False,
    )

    rows = [0, 8, 9]
    expected = pd.DataFrame(
        {
            "x": dict(zip(rows, [0, 8, 9])),
            "display_column": dict(
                zip(rows, ["[0, 7.73)", "[73.5, 512.33)", ">= 512.33"])
            ),
            "value_lower": dict(zip(rows, [0.0, 73.5, 512.3292])),
            "value_upper": dict(zip(rows, [7.732844444444444, 512.3292, np.nan])),
            "count": dict(zip(rows, [99, 99, 3])),
            "Survived": dict(zip(rows, [0.1414141414141414, 0.7171717171717171, 1.0])),
        }
    )

    assert_frame_equal(expected, summary.loc[rows, :], check_like=True)
    assert len(summary) == 10