def plot(self, rot=0, close=True, show_last=True, figsize=(18, 8)):
    """Draw one bar chart per discrete variable from ``self.maps``.

    Each chart shows the continuous value stored for every level of a
    discrete variable (per the original notes, e.g. a probability or a WOE
    value), titled with ``self.method``.

    Args:
        rot: Rotation angle of the tick labels.
        close: If true, ``plt.close('all')`` is called after each figure has
            been built.
        show_last: If true, try to display the last figure that was created.
        figsize: Figure size forwarded to ``plt.figure``.

    Returns:
        dict: Variable name -> figure. When ``self.single`` is ``True`` the
        only key is the empty string ``''``.
    """
    if self.single is True:
        # Single-variable mode: ``self.maps`` itself is the level -> value map.
        jobs = [('', self.maps, self.method)]
    else:
        jobs = [(feature, self.maps[feature],
                 '%s : %s' % (self.method, feature))
                for feature in self.maps]

    result = {}
    fig = None
    for key, mapping, title in jobs:
        data = pd.Series(mapping)
        data.index.name = ''
        data.name = ''
        # Raw string: '\(' is not a valid escape in an ordinary literal.
        # presumably utils.sort orders labels by the number captured by the
        # pattern -- confirm against utils.sort.
        order = utils.sort(data.index.tolist(), ascending=True,
                           pattern=r'\((.*?),', converter=float)
        data = data.reindex(order)
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
        data.plot(kind='bar', rot=rot, ax=ax)
        ax.set_title(title)
        result[key] = fig
        if close:
            plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return result
def plot(self, rot=0, pattern=r'\((.*?),', str_nopattern=None, close=True,
         show_last=True, figsize=(18, 8), xlabel='Discrete Values',
         ylabel='Continuous Values', fontsize=12):
    """Draw one bar chart per discrete variable from ``self.maps``.

    Each chart shows the continuous value stored for every level of a
    discrete variable (per the original notes, e.g. a probability or a WOE
    value), titled with ``self.method``.

    Args:
        rot: Rotation angle of the tick labels.
        pattern: Regular expression forwarded to ``utils.sort``; per the
            original notes the x-axis labels are ordered by the number it
            extracts.
        str_nopattern: Order for labels that do not match ``pattern``
            (forwarded to ``utils.sort``). A list when ``self.single`` is
            ``True``; otherwise a dict mapping the variable name (or
            position) to such a list.
        close: If true, ``plt.close('all')`` is called after each figure has
            been built.
        show_last: If true, try to display the last figure that was created.
        figsize: Figure size forwarded to ``plt.figure``.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        fontsize: Font size used for ticks, axis labels and the title.

    Returns:
        dict: Variable name -> figure. When ``self.single`` is ``True`` the
        only key is the empty string ``''``.
    """
    if self.single is True:
        # Single-variable mode: ``self.maps`` is the level -> value map and
        # ``str_nopattern`` is used as given.
        jobs = [('', self.maps, self.method, str_nopattern)]
    else:
        per_feature = {} if str_nopattern is None else str_nopattern
        jobs = [(feature, self.maps[feature],
                 '%s : %s' % (self.method, feature),
                 per_feature.get(feature, None))
                for feature in self.maps]

    result = {}
    fig = None
    for key, mapping, title, unmatched_order in jobs:
        data = pd.Series(mapping)
        data.index.name = ''
        data.name = ''
        order = utils.sort(data.index.tolist(), ascending=True,
                           pattern=pattern, str_nopattern=unmatched_order,
                           converter=float)
        data = data.reindex(order)
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(111)
        data.plot(kind='bar', rot=rot, ax=ax, fontsize=fontsize)
        ax.set_xlabel(xlabel, fontsize=fontsize)
        ax.set_ylabel(ylabel, fontsize=fontsize)
        ax.set_title(title, fontsize=fontsize)
        result[key] = fig
        if close:
            plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return result
def plot_doubleXY_Mean(X, cols_h=None, cols_v=None, Y_cont=None, Y_cate=None,
                       feature_cate=None, backend='seaborn', figsize=(18, 8),
                       close=True, show_last=True, verbose=False):
    """Plot heatmaps of the mean of Y over every pair of X variables.

    For a 0/1 target the cell value is the share of ones, for a continuous
    target it is the mean; both are computed as a group mean.

    Args:
        X: Raw data as a DataFrame.
        cols_h: Columns used on the horizontal axis; defaults to all columns.
        cols_v: Columns used on the vertical axis; defaults to all columns.
        Y_cont: Continuous target(s): Series, 1-D array or DataFrame.
        Y_cate: 0/1 target(s): Series, 1-D array or DataFrame.
        feature_cate: Names of X columns that are already discrete; every
            other selected column is discretised first.
        backend: ``'seaborn'`` uses ``sns.heatmap``; any other value uses the
            ``heatmap`` helper.
        figsize: Figure size forwarded to ``plt.subplots``.
        close: If true, ``plt.close('all')`` is called after each figure.
        show_last: If true, try to display the last figure that was created.
        verbose: If true, print each (vertical, horizontal) pair.

    Returns:
        dict: ``(hcol, vcol)`` -> figure with one heatmap per Y column.

    Raises:
        ValueError: If neither or both of ``Y_cont`` and ``Y_cate`` are given.
    """
    # Validate before doing any work. The second check used to repeat the
    # first condition and therefore could never fire.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    # Discretise the continuous variables first.
    feature_cont = [col for col in cols_v + cols_h if col not in feature_cate]
    clf = discretize.QuantileDiscretizer(
        feature_names=feature_cont,
        quantiles=[10 * k for k in range(1, 10)],
        return_numeric=False,
        fill_na='missing')
    data = clf.fit_transform(data)

    Y = pd.DataFrame(Y_cate if Y_cate is not None else Y_cont)
    n = Y.shape[1]
    cols_Y = sorted(Y.columns)

    # Rows are sorted descending for seaborn -- presumably so that the
    # smallest bin ends up at the bottom of the heatmap.
    rows_ascending = backend != 'seaborn'
    annot_fmt = '.2%' if Y_cate is not None else 'g'
    label_pattern = r'\((.*?),'

    fig_dict = {}
    fig = None
    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Skip the diagonal and pairs whose transpose is already plotted.
            if vcol == hcol or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = (Y[col].groupby([data[hcol], data[vcol]])
                         .mean().unstack(hcol))
                row_order = utils.sort(value.index.tolist(),
                                       ascending=rows_ascending,
                                       pattern=label_pattern, converter=float)
                col_order = utils.sort(value.columns.tolist(),
                                       ascending=True,
                                       pattern=label_pattern, converter=float)
                # reindex() replaces reindex_axis(), which modern pandas no
                # longer provides.
                value = value.reindex(index=row_order, columns=col_order)
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if backend == 'seaborn':
                    sns.heatmap(value, ax=axes[i], annot=True, fmt=annot_fmt)
                    axes[i].set_title(title)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value, ax=axes[i], xlabel='', ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index, title=title)
            if backend != 'seaborn':
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return fig_dict
def plot_singleXY_Mean(X, Y, cols=None, feature_cate=None, normalize=True,
                       figsize=(18, 8), close=True, show_last=True,
                       verbose=False):
    """Plot sample counts and the mean of Y for every level of each X column.

    For a 0/1 target the mean is the share of ones per level; for a
    continuous target it is the plain mean.

    Args:
        X: Raw data as a DataFrame.
        Y: Continuous or 0/1 target(s): Series, 1-D array or DataFrame.
        cols: Columns of ``X`` to plot; defaults to all columns.
        feature_cate: Names of X columns that are already discrete; every
            other selected column is discretised at the 20/40/60/80
            quantiles first.
        normalize: If true, counts are divided by their column total.
        figsize: Figure size forwarded to ``plt.subplots``.
        close: If true, ``plt.close('all')`` is called after each figure.
        show_last: If true, try to display the last figure that was created.
        verbose: If true, print each column name as it is processed.

    Returns:
        dict: Column name -> figure with two stacked subplots (counts on
        top, mean of Y below).
    """
    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    Ynew = pd.DataFrame(Y)
    # A 1-D array has no column name worth showing in a legend.
    legend = not (isinstance(Y, np.ndarray) and len(Y.shape) == 1)

    fig_dict = {}
    fig = None
    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=[20 * k for k in range(1, 5)],
                return_numeric=False,
                fill_na='Missing')
            data[col] = clf.fit_transform(data[col])

        grouped = Ynew.groupby(data[col])
        value_count = grouped.count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        # Both tables come from the same grouping, so one ordering serves
        # both subplots (they share the x axis).
        order = utils.sort(value_count.index.tolist(), ascending=True,
                           pattern=r'\((.*?),', converter=float)
        value_count = value_count.reindex(order)
        value_mean = grouped.mean().reindex(order)

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar', rot=30, ax=axes[0], legend=legend)
        value_mean.plot(kind='bar', rot=30, ax=axes[1], legend=legend)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Count of Samples')
        axes[0].set_title(col)
        axes[1].set_xlabel('')
        axes[1].set_ylabel('Mean of Y')
        fig_dict[col] = fig
        if close:
            plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return fig_dict
def plot_singleXY_PercentInY(X, cols=None, Y_cont=None, Y_cate=None,
                             feature_cate=None, figsize=(18, 8), close=True,
                             show_last=True, verbose=False):
    """Plot sample counts and a Y summary for every level of each X column.

    With a discrete target (``Y_cate``) the lower subplot shows, per class of
    Y, the share of that class falling into each level of X. With a
    continuous target (``Y_cont``) it shows the mean of Y per level.

    Args:
        X: Raw data as a DataFrame.
        cols: Columns of ``X`` to plot; defaults to all columns.
        Y_cont: Continuous target: Series or 1-D array.
        Y_cate: 0/1 target: Series or 1-D array.
        feature_cate: Names of X columns that are already discrete; every
            other selected column is discretised at the 20/40/60/80
            quantiles first.
        figsize: Figure size forwarded to ``plt.subplots``.
        close: If true, ``plt.close('all')`` is called after each figure.
        show_last: If true, try to display the last figure that was created.
        verbose: If true, print each column name as it is processed.

    Returns:
        dict: Column name -> figure with two stacked subplots (counts on
        top, Y summary below).

    Raises:
        ValueError: If neither or both of ``Y_cont`` and ``Y_cate`` are given.
    """
    # The second check used to repeat the first condition and therefore
    # could never fire.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    is_cate = Y_cate is not None
    target = pd.Series(Y_cate if is_cate else Y_cont)
    lower_label = 'Percent in Category of Y' if is_cate else 'Mean of Y'

    fig_dict = dict.fromkeys(cols)
    fig = None
    for column in cols:
        if verbose:
            print(column)
        if column not in feature_cate:
            bins = [20 * k for k in range(1, 5)]
            if is_cate:
                clf = discretize.QuantileDiscretizer(
                    quantiles=bins, return_numeric=False, fill_na='missing')
            else:
                # NOTE(review): no fill_na is passed on the continuous path,
                # unlike the discrete path -- kept as in the original.
                clf = discretize.QuantileDiscretizer(
                    quantiles=bins, return_numeric=False)
            data[column] = clf.fit_transform(data[column])

        if is_cate:
            count = pd.crosstab(data[column], target)
            count.columns.name = ''
            count.index.name = column
            order = utils.sort(count.index.tolist(), ascending=True,
                               pattern=r'\((.*?),', converter=float)
            count = count.reindex(order)
            # Column-wise share: each class of Y sums to 1 over the levels.
            ratio = count / count.sum()
        else:
            grouped = target.groupby(data[column])
            count = grouped.count()
            count.name = ''
            order = utils.sort(count.index.tolist(), ascending=True,
                               pattern=r'\((.*?),', converter=float)
            count = count.reindex(order)
            ratio = grouped.mean()
            ratio.name = ''
            ratio = ratio.reindex(order)

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        count.plot(kind='bar', ax=axes[0], rot=0)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Count of Samples')
        axes[0].set_title(column)
        axes[0].legend(loc='best')
        ratio.plot(kind='bar', ax=axes[1], rot=0)
        axes[1].set_xlabel('')
        axes[1].set_ylabel(lower_label)
        axes[1].legend(loc='best')
        if close:
            plt.close('all')
        fig_dict[column] = fig

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return fig_dict
def plot_doubleXY_Mean(X, cols_h=None, cols_v=None, Y_cont=None, Y_cate=None,
                       feature_cate=None, quantiles=None, cuts=None,
                       pattern=r'\((.*?),', str_nopattern=None, fontsize=12,
                       backend='seaborn', figsize=(18, 8), close=True,
                       show_last=True, verbose=False):
    """Plot heatmaps of the mean of Y over every pair of X variables.

    For a 0/1 target the cell value is the share of ones, for a continuous
    target it is the mean; both are computed as a group mean.

    Args:
        X: Raw data as a DataFrame.
        cols_h: Columns used on the horizontal axis; defaults to all columns.
        cols_v: Columns used on the vertical axis; defaults to all columns.
        Y_cont: Continuous target(s): Series, 1-D array or DataFrame.
        Y_cate: 0/1 target(s): Series, 1-D array or DataFrame.
        feature_cate: Names of X columns that are already discrete; every
            other selected column is discretised first.
        quantiles: dict, column name -> quantile points used to discretise
            that column. Columns without an entry use 10, 20, ..., 90.
        cuts: dict, column name -> explicit cut points. Takes precedence
            over ``quantiles`` for that column.
        pattern: Regular expression forwarded to ``utils.sort``; per the
            original notes the axis labels are ordered by the number it
            extracts.
        str_nopattern: dict, column name (or position) -> list giving the
            order of labels that do not match ``pattern``.
        fontsize: Font size of the titles.
        backend: ``'seaborn'`` uses ``sns.heatmap``; any other value uses the
            ``heatmap`` helper.
        figsize: Figure size forwarded to ``plt.subplots``.
        close: If true, ``plt.close('all')`` is called after each figure.
        show_last: If true, try to display the last figure that was created.
        verbose: If true, print each (vertical, horizontal) pair.

    Returns:
        dict: ``(hcol, vcol)`` -> figure with one heatmap per Y column.

    Raises:
        ValueError: If neither or both of ``Y_cont`` and ``Y_cate`` are given.
    """
    # Validate before doing any work. The second check used to repeat the
    # first condition and therefore could never fire.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    if quantiles is None:
        quantiles = {}
    if cuts is None:
        cuts = {}
    # Normalise into fresh dicts so the caller's arguments are not modified.
    # np.unique already returns its result sorted.
    quantiles = {key: np.unique(points).tolist()
                 for key, points in quantiles.items()}
    cuts = {key: np.unique(points) for key, points in cuts.items()}
    q_default = [10 * k for k in range(1, 10)]

    # Discretise the continuous variables first.
    feature_cont = {col for col in cols_v + cols_h if col not in feature_cate}
    for column in feature_cont:
        clf = discretize.QuantileDiscretizer(
            quantiles=quantiles.get(column, q_default),
            return_numeric=False,
            fill_na='Missing')
        if column in cuts:
            # Explicit cut points replace fitting.
            clf.cuts = cuts[column]
        else:
            clf.fit(data[column])
        data[column] = clf.transform(data[column])
    data = data.fillna('Missing')

    Y = pd.DataFrame(Y_cate if Y_cate is not None else Y_cont)
    n = Y.shape[1]
    cols_Y = sorted(Y.columns)

    # Rows are sorted descending for seaborn -- presumably so that the
    # smallest bin ends up at the bottom of the heatmap.
    rows_ascending = backend != 'seaborn'
    annot_fmt = '.2%' if Y_cate is not None else 'g'

    fig_dict = {}
    fig = None
    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Skip the diagonal and pairs whose transpose is already plotted.
            if vcol == hcol or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = (Y[col].groupby([data[hcol], data[vcol]])
                         .mean().unstack(hcol))
                row_order = utils.sort(
                    value.index.tolist(), ascending=rows_ascending,
                    pattern=pattern,
                    str_nopattern=str_nopattern.get(vcol, None),
                    converter=float)
                col_order = utils.sort(
                    value.columns.tolist(), ascending=True,
                    pattern=pattern,
                    str_nopattern=str_nopattern.get(hcol, None),
                    converter=float)
                # reindex() replaces reindex_axis(), which modern pandas no
                # longer provides.
                value = value.reindex(index=row_order, columns=col_order)
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if backend == 'seaborn':
                    sns.heatmap(value, ax=axes[i], annot=True, fmt=annot_fmt)
                    axes[i].set_title(title, fontsize=fontsize)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value, ax=axes[i], xlabel='', ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index, title=title,
                                    fontsize=fontsize)
            if backend != 'seaborn':
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return fig_dict
def plot_singleXY_Mean(X, Y, cols=None, feature_cate=None, normalize=True,
                       quantiles=None, cuts=None, pattern=r'\((.*?),',
                       str_nopattern=None, ylabel=None, fontsize=12,
                       figsize=(18, 8), close=True, show_last=True,
                       verbose=False):
    """Plot sample counts and the mean of Y for every level of each X column.

    For a 0/1 target the mean is the share of ones per level; for a
    continuous target it is the plain mean.

    Args:
        X: Raw data as a DataFrame.
        Y: Continuous or 0/1 target(s): Series, 1-D array or DataFrame.
        cols: Columns of ``X`` to plot; defaults to all columns.
        feature_cate: Names of X columns that are already discrete; every
            other selected column is discretised first.
        normalize: If true, counts are divided by their column total.
        quantiles: dict, column name -> quantile points used to discretise
            that column. Columns without an entry use 20, 40, 60, 80.
        cuts: dict, column name -> explicit cut points. Takes precedence
            over ``quantiles`` for that column.
        pattern: Regular expression forwarded to ``utils.sort``; per the
            original notes the x-axis labels are ordered by the number it
            extracts.
        str_nopattern: dict, column name (or position) -> list giving the
            order of labels that do not match ``pattern``.
        ylabel: Two labels, one per subplot; defaults to
            ``['Count of Samples', 'Mean of Y']``.
        fontsize: Font size for ticks, labels and title.
        figsize: Figure size forwarded to ``plt.subplots``.
        close: If true, ``plt.close('all')`` is called after each figure.
        show_last: If true, try to display the last figure that was created.
        verbose: If true, print each column name as it is processed.

    Returns:
        dict: Column name -> figure with two stacked subplots (counts on
        top, mean of Y below).
    """
    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    if quantiles is None:
        quantiles = {}
    if cuts is None:
        cuts = {}
    if ylabel is None:
        ylabel = ['Count of Samples', 'Mean of Y']
    # Normalise into fresh dicts so the caller's arguments are not modified.
    # np.unique already returns its result sorted.
    quantiles = {key: np.unique(points).tolist()
                 for key, points in quantiles.items()}
    cuts = {key: np.unique(points) for key, points in cuts.items()}
    q_default = [20 * k for k in range(1, 5)]

    Ynew = pd.DataFrame(Y)
    # No legend for any ndarray or any 1-D target. np.ndim also accepts
    # plain sequences, which have no ``shape`` attribute.
    legend = not (isinstance(Y, np.ndarray) or np.ndim(Y) == 1)

    fig_dict = {}
    fig = None
    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=quantiles.get(col, q_default),
                return_numeric=False,
                fill_na='Missing')
            if col in cuts:
                # Explicit cut points replace fitting.
                clf.cuts = cuts[col]
            else:
                clf.fit(data[col])
            data[col] = clf.transform(data[col])
        data[col] = data[col].fillna('Missing')

        grouped = Ynew.groupby(data[col])
        value_count = grouped.count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        # Both tables come from the same grouping, so one ordering serves
        # both subplots (they share the x axis).
        order = utils.sort(value_count.index.tolist(), ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(col, None),
                           converter=float)
        value_count = value_count.reindex(order)
        value_mean = grouped.mean().reindex(order)

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar', rot=30, ax=axes[0], legend=legend,
                         fontsize=fontsize)
        value_mean.plot(kind='bar', rot=30, ax=axes[1], legend=legend,
                        fontsize=fontsize)
        axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
        axes[0].set_title(col, fontsize=fontsize)
        axes[1].set_xlabel('')
        axes[1].set_ylabel(ylabel[1], fontsize=fontsize)
        fig_dict[col] = fig
        if close:
            plt.close('all')

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return fig_dict
def plot_singleXY_PercentInY(X, cols=None, Y_cont=None, Y_cate=None,
                             feature_cate=None, quantiles=None, cuts=None,
                             pattern=r'\((.*?),', str_nopattern=None,
                             xlabel=None, ylabel=None, color_map=None,
                             legend_map=None, fontsize=12, figsize=(18, 8),
                             close=True, show_last=True, verbose=False):
    """Plot sample counts and a Y summary for every level of each X column.

    With a discrete target (``Y_cate``) the lower subplot shows, per class of
    Y, the share of that class falling into each level of X. With a
    continuous target (``Y_cont``) it shows the mean of Y per level.

    Args:
        X: Raw data as a DataFrame.
        cols: Columns of ``X`` to plot; defaults to all columns.
        Y_cont: Continuous target: Series or 1-D array.
        Y_cate: 0/1 target: Series or 1-D array.
        feature_cate: Names of X columns that are already discrete; every
            other selected column is discretised first.
        quantiles: dict, column name -> quantile points used to discretise
            that column. Columns without an entry use 20, 40, 60, 80.
        cuts: dict, column name -> explicit cut points. Takes precedence
            over ``quantiles`` for that column.
        pattern: Regular expression forwarded to ``utils.sort``; per the
            original notes the x-axis labels are ordered by the number it
            extracts.
        str_nopattern: dict, column name (or position) -> list giving the
            order of labels that do not match ``pattern``.
        xlabel: Label of the horizontal axis (set on the lower subplot);
            defaults to an empty string.
        ylabel: Two labels, one per subplot. Defaults to
            ``['Count of Samples', 'Percent in Category of Y']`` for a
            discrete target and ``['Count of Samples', 'Mean of Y']`` for a
            continuous one.
        color_map: dict, original Y value -> bar colour, e.g.
            ``{1: 'red', 0: 'blue'}``. Discrete target only.
        legend_map: dict, original Y value -> legend text, e.g.
            ``{1: 'bad', 0: 'good'}``. Discrete target only.
        fontsize: Font size for ticks, labels, title and legend.
        figsize: Figure size forwarded to ``plt.subplots``.
        close: If true, ``plt.close('all')`` is called after each figure.
        show_last: If true, try to display the last figure that was created.
        verbose: If true, print each column name as it is processed.

    Returns:
        dict: Column name -> figure with two stacked subplots (counts on
        top, Y summary below).

    Raises:
        ValueError: If neither or both of ``Y_cont`` and ``Y_cate`` are given.
    """
    # Validate before doing any work. The second check used to repeat the
    # first condition and therefore could never fire.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    if xlabel is None:
        xlabel = ''
    if legend_map is None:
        legend_map = {}
    if color_map is None:
        color_map = {}
    if quantiles is None:
        quantiles = {}
    if cuts is None:
        cuts = {}
    # Normalise into fresh dicts so the caller's arguments are not modified.
    # np.unique already returns its result sorted.
    quantiles = {key: np.unique(points).tolist()
                 for key, points in quantiles.items()}
    cuts = {key: np.unique(points) for key, points in cuts.items()}
    q_default = [20 * k for k in range(1, 5)]

    is_cate = Y_cate is not None
    target = pd.Series(Y_cate if is_cate else Y_cont)
    if ylabel is None:
        if is_cate:
            ylabel = ['Count of Samples', 'Percent in Category of Y']
        else:
            ylabel = ['Count of Samples', 'Mean of Y']

    fig_dict = dict.fromkeys(cols)
    fig = None
    for column in cols:
        if verbose:
            print(column)
        if column not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=quantiles.get(column, q_default),
                return_numeric=False,
                fill_na='Missing')
            if column in cuts:
                # Explicit cut points replace fitting.
                clf.cuts = cuts[column]
            else:
                clf.fit(data[column])
            data[column] = clf.transform(data[column])
        data[column] = data[column].fillna('Missing')

        bar_kwargs = {}
        if is_cate:
            count = pd.crosstab(data[column], target)
            count.columns.name = ''
            count.index.name = column
            order = utils.sort(count.index.tolist(), ascending=True,
                               pattern=pattern,
                               str_nopattern=str_nopattern.get(column, None),
                               converter=float)
            count = count.reindex(order)
            # Colours are looked up by the original Y values, so this must
            # happen before the columns are renamed for the legend.
            colors = [color_map.get(level, None) for level in count.columns]
            if any(shade is not None for shade in colors):
                bar_kwargs['color'] = colors
            # Otherwise leave ``color`` unset so pandas picks its defaults.
            # NOTE(review): an all-None colour list may be rejected as an
            # invalid colour spec -- confirm against the pandas version used.
            count.columns = count.columns.map(
                lambda level: legend_map.get(level, level))
            # Column-wise share: each class of Y sums to 1 over the levels.
            ratio = count / count.sum()
        else:
            grouped = target.groupby(data[column])
            count = grouped.count()
            count.name = ''
            order = utils.sort(count.index.tolist(), ascending=True,
                               pattern=pattern,
                               str_nopattern=str_nopattern.get(column, None),
                               converter=float)
            count = count.reindex(order)
            ratio = grouped.mean()
            ratio.name = ''
            # Reuse the ordering of ``count``: the subplots share the x axis,
            # so both series must list the levels in the same order.
            ratio = ratio.reindex(order)

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        count.plot(kind='bar', ax=axes[0], rot=0, fontsize=fontsize,
                   **bar_kwargs)
        axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
        axes[0].set_title(column, fontsize=fontsize)
        axes[0].legend(loc='best', fontsize=fontsize)
        ratio.plot(kind='bar', ax=axes[1], rot=0, fontsize=fontsize,
                   **bar_kwargs)
        axes[1].set_xlabel(xlabel, fontsize=fontsize)
        axes[1].set_ylabel(ylabel[1], fontsize=fontsize)
        axes[1].legend(loc='best', fontsize=fontsize)
        if close:
            plt.close('all')
        fig_dict[column] = fig

    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Displaying is best effort: any error raised by show() is
            # ignored so that the figures are still returned.
            pass
    return fig_dict