예제 #1
0
 def plot(self,rot=0,close=True,show_last=True,figsize=(18,8)):
     '''
     Draw a bar chart of the continuous value mapped to each category of
     every discrete variable (e.g. to inspect per-category probability or
     WOE values).

     Parameters
     ----------
     rot : rotation angle of the x tick labels.
     close : if True, ``plt.close('all')`` is called after each figure is
         built (note that this closes every open pyplot figure).
     show_last : if True, try to show the last figure that was created.
     figsize : figure size passed to ``plt.figure``.

     Returns
     -------
     dict mapping variable name to its figure. When ``self.single`` is
     True the only key is the empty string ''.
     '''
     # Raw string: in a normal literal '\(' is an invalid escape sequence.
     bin_pattern = r'\((.*?),'

     def draw(mapping, title):
         # Render one category -> value mapping as a bar chart.
         data = pd.Series(mapping)
         data.index.name = ''
         data.name = ''
         # presumably orders interval labels such as '(a, b]' by their
         # lower bound -- confirm against utils.sort
         data = data.reindex(utils.sort(data.index.tolist(),
                                        ascending=True,
                                        pattern=bin_pattern,
                                        converter=float))
         figure = plt.figure(figsize=figsize)
         ax = figure.add_subplot(111)
         data.plot(kind='bar', rot=rot, ax=ax)
         ax.set_title(title)
         if close:
             plt.close('all')
         return figure

     result = {}
     fig = None  # stays None when self.maps has no features to draw
     if self.single is True:
         fig = draw(self.maps, self.method)
         result[''] = fig
     else:
         for feature in self.maps:
             fig = draw(self.maps[feature],
                        '%s : %s' % (self.method, feature))
             result[feature] = fig
     if show_last and fig is not None:
         try:
             fig.show()
         except Exception:
             # Showing is best effort: the figure may already have been
             # closed above or the backend may be unable to show it.
             pass
     return result
예제 #2
0
 def plot(self,
          rot=0,
          pattern=r'\((.*?),',
          str_nopattern=None,
          close=True,
          show_last=True,
          figsize=(18, 8),
          xlabel='Discrete Values',
          ylabel='Continuous Values',
          fontsize=12):
     '''
     Draw a bar chart of the continuous value mapped to each category of
     every discrete variable (e.g. to inspect per-category probability or
     WOE values).

     Parameters
     ----------
     rot : rotation angle of the x tick labels.
     pattern : regular expression matched against the x-axis labels; labels
         are ordered by the numeric value extracted with it.
     str_nopattern : ordering of the labels that do not match ``pattern``.
         When ``self.single`` is True this is a list; otherwise it is a
         dict mapping variable name (or position) to such a list.
     close : if True, ``plt.close('all')`` is called after each figure is
         built (note that this closes every open pyplot figure).
     show_last : if True, try to show the last figure that was created.
     figsize : figure size passed to ``plt.figure``.
     xlabel, ylabel : axis labels.
     fontsize : font size used for ticks, labels and title.

     Returns
     -------
     dict mapping variable name to its figure. When ``self.single`` is
     True the only key is the empty string ''.
     '''

     def draw(mapping, title, nopattern):
         # Render one category -> value mapping as a bar chart.
         data = pd.Series(mapping)
         data.index.name = ''
         data.name = ''
         data = data.reindex(
             utils.sort(data.index.tolist(),
                        ascending=True,
                        pattern=pattern,
                        str_nopattern=nopattern,
                        converter=float))
         figure = plt.figure(figsize=figsize)
         ax = figure.add_subplot(111)
         data.plot(kind='bar', rot=rot, ax=ax, fontsize=fontsize)
         ax.set_xlabel(xlabel, fontsize=fontsize)
         ax.set_ylabel(ylabel, fontsize=fontsize)
         ax.set_title(title, fontsize=fontsize)
         if close:
             plt.close('all')
         return figure

     result = {}
     fig = None  # stays None when self.maps has no features to draw
     if self.single is True:
         fig = draw(self.maps, self.method, str_nopattern)
         result[''] = fig
     else:
         per_feature = {} if str_nopattern is None else str_nopattern
         for feature in self.maps:
             fig = draw(self.maps[feature],
                        '%s : %s' % (self.method, feature),
                        per_feature.get(feature, None))
             result[feature] = fig
     if show_last and fig is not None:
         try:
             fig.show()
         except Exception:
             # Showing is best effort: the figure may already have been
             # closed above or the backend may be unable to show it.
             pass
     return result
예제 #3
0
def plot_doubleXY_Mean(X,
                       cols_h=None,
                       cols_v=None,
                       Y_cont=None,
                       Y_cate=None,
                       feature_cate=None,
                       backend='seaborn',
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Heat maps of the mean of Y over every pair of X variables. For a 0-1
    target the mean is the share of 1s; for a continuous target it is the
    plain mean.

    Parameters
    ----------
    X : DataFrame with the raw data.
    cols_h : list of columns used on the horizontal axis; defaults to all
        columns of X.
    cols_v : list of columns used on the vertical axis; defaults to all
        columns of X.
    Y_cont : continuous target (Series, 1-D array or DataFrame).
    Y_cate : 0-1 target (Series, 1-D array or DataFrame).
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate : list of X columns that are already discrete; all other
        selected columns are discretized at the 10%, ..., 90% quantiles.
    backend : plotting backend, 'seaborn' or anything else for the
        matplotlib-based ``heatmap`` helper.
    figsize : figure size passed to ``plt.subplots``.
    close : if True, ``plt.close('all')`` is called after each figure.
    show_last : if True, try to show the last figure that was created.
    verbose : if True, print each (vertical, horizontal) pair.

    Returns
    -------
    dict keyed by ``(horizontal column, vertical column)`` whose values are
    the figures (one subplot per Y column).

    Raises
    ------
    ValueError if neither or both of Y_cont / Y_cate are given.
    '''
    if (Y_cont is None) and (Y_cate is None):
        raise ValueError('Y值未给定!')
    # The original repeated the condition above here, so passing both
    # targets was never rejected.
    if (Y_cont is not None) and (Y_cate is not None):
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    # Discretize the continuous variables first. A column that is on both
    # axes is listed only once.
    feature_cont = []
    for name in list(cols_v) + list(cols_h):
        if name not in feature_cate and name not in feature_cont:
            feature_cont.append(name)
    clf = discretize.QuantileDiscretizer(
        feature_names=feature_cont,
        quantiles=[10 * k for k in range(1, 10)],
        return_numeric=False,
        fill_na='missing')
    data = clf.fit_transform(data)

    is_cate = Y_cate is not None
    Y = pd.DataFrame(Y_cate if is_cate else Y_cont)
    n = Y.shape[1]
    cols_Y = sorted(Y.columns)
    use_seaborn = backend == 'seaborn'
    # Raw string: in a normal literal '\(' is an invalid escape sequence.
    bin_pattern = r'\((.*?),'

    def ordered(labels, ascending):
        # Sorted label list for one axis of the pivoted mean table.
        return utils.sort(labels.tolist(),
                          ascending=ascending,
                          pattern=bin_pattern,
                          converter=float)

    fig_dict = {}
    fig = None  # stays None when no pair is drawn
    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Keys are stored as (hcol, vcol), so this membership test
            # skips a pair whose transposed plot already exists.
            if (vcol == hcol) or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = Y[col].groupby([data[hcol],
                                        data[vcol]]).mean().unstack(hcol)
                # reindex(index=/columns=) replaces reindex_axis, which has
                # been removed from pandas. Rows are sorted descending for
                # seaborn and ascending otherwise, as before.
                value = value.reindex(
                    index=ordered(value.index, not use_seaborn))
                value = value.reindex(columns=ordered(value.columns, True))
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if use_seaborn:
                    sns.heatmap(value,
                                ax=axes[i],
                                annot=True,
                                fmt='.2%' if is_cate else 'g')
                    axes[i].set_title(title)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value,
                                    ax=axes[i],
                                    xlabel='',
                                    ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index,
                                    title=title)
            if not use_seaborn:
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already have been
            # closed above or the backend may be unable to show it.
            pass
    return fig_dict
예제 #4
0
def plot_singleXY_Mean(X,
                       Y,
                       cols=None,
                       feature_cate=None,
                       normalize=True,
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    For each selected X column draw two stacked bar charts: the number (or
    share) of samples per category, and the mean of Y per category. For a
    0-1 target the mean is the share of 1s in the category.

    Parameters
    ----------
    X : DataFrame with the raw data.
    Y : continuous or 0-1 target (Series, 1-D array or DataFrame).
    cols : list of columns to plot; defaults to all columns of X.
    feature_cate : list of X columns that are already discrete; all other
        columns are discretized at the 20%, 40%, 60%, 80% quantiles.
    normalize : if True, sample counts are divided by their column total.
    figsize : figure size passed to ``plt.subplots``.
    close : if True, ``plt.close('all')`` is called after each figure.
    show_last : if True, try to show the last figure that was created.
    verbose : if True, print each column name as it is processed.

    Returns
    -------
    dict mapping column name to its figure (upper subplot: counts, lower
    subplot: mean of Y).
    '''
    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    # NOTE(review): groupby below aligns Y with X on the index; a bare
    # array gets a default RangeIndex -- confirm X uses one as well.
    Ynew = pd.DataFrame(Y)
    # A 1-D array has no column names worth showing in a legend.
    legend = not (isinstance(Y, np.ndarray) and len(Y.shape) == 1)
    q_default = [20 * k for k in range(1, 5)]
    # Raw string: in a normal literal '\(' is an invalid escape sequence.
    bin_pattern = r'\((.*?),'

    fig_dict = {}
    fig = None  # stays None when cols is empty
    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=list(q_default),
                return_numeric=False,
                fill_na='Missing')
            data[col] = clf.fit_transform(data[col])

        # Counts and means come from one grouping and share one index,
        # so a single ordering serves both subplots.
        grouped = Ynew.groupby(data[col])
        value_count = grouped.count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        value_mean = grouped.mean()
        order = utils.sort(value_count.index.tolist(),
                           ascending=True,
                           pattern=bin_pattern,
                           converter=float)
        value_count = value_count.reindex(order)
        value_mean = value_mean.reindex(order)

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar', rot=30, ax=axes[0], legend=legend)
        value_mean.plot(kind='bar', rot=30, ax=axes[1], legend=legend)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Count of Samples')
        axes[0].set_title(col)

        axes[1].set_xlabel('')
        axes[1].set_ylabel('Mean of Y')

        fig_dict[col] = fig
        if close:
            plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already have been
            # closed above or the backend may be unable to show it.
            pass

    return fig_dict
예제 #5
0
def plot_singleXY_PercentInY(X,
                             cols=None,
                             Y_cont=None,
                             Y_cate=None,
                             feature_cate=None,
                             figsize=(18, 8),
                             close=True,
                             show_last=True,
                             verbose=False):
    '''
    For each selected X column draw two stacked bar charts against a single
    target. With a 0-1 target: sample counts per category and class, and
    the share of each class falling in every category. With a continuous
    target: sample counts per category and the mean of Y per category.

    Parameters
    ----------
    X : DataFrame with the raw data.
    cols : list of columns to plot; defaults to all columns of X.
    Y_cont : continuous target (Series or 1-D array).
    Y_cate : 0-1 target (Series or 1-D array).
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate : list of X columns that are already discrete; all other
        columns are discretized at the 20%, 40%, 60%, 80% quantiles.
    figsize : figure size passed to ``plt.subplots``.
    close : if True, ``plt.close('all')`` is called after each figure.
    show_last : if True, try to show the last figure that was created.
    verbose : if True, print each column name as it is processed.

    Returns
    -------
    dict mapping every entry of ``cols`` to its figure (upper subplot:
    counts; lower subplot: within-class share or mean of Y).

    Raises
    ------
    ValueError if neither or both of Y_cont / Y_cate are given.
    '''
    if (Y_cont is None) and (Y_cate is None):
        raise ValueError('Y值未给定!')
    # The original repeated the condition above here, so passing both
    # targets was never rejected.
    if (Y_cont is not None) and (Y_cate is not None):
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    is_cate = Y_cate is not None
    target = pd.Series(Y_cate if is_cate else Y_cont)
    lower_ylabel = 'Percent in Category of Y' if is_cate else 'Mean of Y'
    # Raw string: in a normal literal '\(' is an invalid escape sequence.
    bin_pattern = r'\((.*?),'

    fig_dict = dict.fromkeys(cols)
    fig = None  # stays None when cols is empty
    for column in cols:
        if verbose:
            print(column)
        if column not in feature_cate:
            disc_kwargs = dict(quantiles=[20 * k for k in range(1, 5)],
                               return_numeric=False)
            if is_cate:
                # NOTE(review): only the 0-1 branch set fill_na in the
                # original; kept as is -- confirm whether the continuous
                # branch should label missing values too.
                disc_kwargs['fill_na'] = 'missing'
            clf = discretize.QuantileDiscretizer(**disc_kwargs)
            data[column] = clf.fit_transform(data[column])

        if is_cate:
            count = pd.crosstab(data[column], target)
            count.columns.name = ''
            count.index.name = column
        else:
            count = target.groupby(data[column]).count()
            count.name = ''
        order = utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=bin_pattern,
                           converter=float)
        count = count.reindex(order)

        if is_cate:
            # Column-wise normalization: share of each Y class per bin.
            ratio = count / count.sum()
        else:
            ratio = target.groupby(data[column]).mean().reindex(order)
            ratio.name = ''

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        count.plot(kind='bar', ax=axes[0], rot=0)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Count of Samples')
        axes[0].set_title(column)
        axes[0].legend(loc='best')

        ratio.plot(kind='bar', ax=axes[1], rot=0)
        axes[1].set_xlabel('')
        axes[1].set_ylabel(lower_ylabel)
        axes[1].legend(loc='best')

        if close:
            plt.close('all')
        fig_dict[column] = fig
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already have been
            # closed above or the backend may be unable to show it.
            pass
    return fig_dict
예제 #6
0
def plot_doubleXY_Mean(X,
                       cols_h=None,
                       cols_v=None,
                       Y_cont=None,
                       Y_cate=None,
                       feature_cate=None,
                       quantiles=None,
                       cuts=None,
                       pattern=r'\((.*?),',
                       str_nopattern=None,
                       fontsize=12,
                       backend='seaborn',
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Heat maps of the mean of Y over every pair of X variables. For a 0-1
    target the mean is the share of 1s; for a continuous target it is the
    plain mean.

    Parameters
    ----------
    X : DataFrame with the raw data.
    cols_h : list of columns used on the horizontal axis; defaults to all
        columns of X.
    cols_v : list of columns used on the vertical axis; defaults to all
        columns of X.
    Y_cont : continuous target (Series, 1-D array or DataFrame).
    Y_cate : 0-1 target (Series, 1-D array or DataFrame).
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate : list of X columns that are already discrete.
    quantiles : dict mapping column name to the quantile points used to
        discretize it; columns not listed use 10, 20, ..., 90.
    cuts : dict mapping column name to explicit cut points; takes
        precedence over ``quantiles``.
    pattern : regular expression matched against the axis labels; labels
        are ordered by the numeric value extracted with it.
    str_nopattern : dict mapping column name (or position) to the ordering
        of the labels that do not match ``pattern``.
    fontsize : font size of the titles.
    backend : plotting backend, 'seaborn' or anything else for the
        matplotlib-based ``heatmap`` helper.
    figsize : figure size passed to ``plt.subplots``.
    close : if True, ``plt.close('all')`` is called after each figure.
    show_last : if True, try to show the last figure that was created.
    verbose : if True, print each (vertical, horizontal) pair.

    Returns
    -------
    dict keyed by ``(horizontal column, vertical column)`` whose values are
    the figures (one subplot per Y column).

    Raises
    ------
    ValueError if neither or both of Y_cont / Y_cate are given.
    '''
    if (Y_cont is None) and (Y_cate is None):
        raise ValueError('Y值未给定!')
    # The original repeated the condition above here, so passing both
    # targets was never rejected.
    if (Y_cont is not None) and (Y_cate is not None):
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    # Normalize into fresh dicts so the caller's arguments are not mutated.
    # np.unique already returns its result sorted.
    quantiles = dict((key, np.unique(val).tolist())
                     for key, val in (quantiles or {}).items())
    cuts = dict((key, np.unique(val)) for key, val in (cuts or {}).items())
    q_default = [10 * k for k in range(1, 10)]

    # Discretize each continuous variable once, in first-seen order.
    feature_cont = []
    for name in list(cols_v) + list(cols_h):
        if name not in feature_cate and name not in feature_cont:
            feature_cont.append(name)
    for column in feature_cont:
        clf = discretize.QuantileDiscretizer(
            quantiles=quantiles.get(column, q_default),
            return_numeric=False,
            fill_na='Missing')
        if column in cuts:
            # Explicit cut points bypass fitting.
            clf.cuts = cuts[column]
        else:
            clf.fit(data[column])
        data[column] = clf.transform(data[column])
    data = data.fillna('Missing')

    is_cate = Y_cate is not None
    Y = pd.DataFrame(Y_cate if is_cate else Y_cont)
    n = Y.shape[1]
    cols_Y = sorted(Y.columns)
    use_seaborn = backend == 'seaborn'

    def ordered(labels, ascending, column):
        # Sorted label list for the axis that shows `column`.
        return utils.sort(labels.tolist(),
                          ascending=ascending,
                          pattern=pattern,
                          str_nopattern=str_nopattern.get(column, None),
                          converter=float)

    fig_dict = {}
    fig = None  # stays None when no pair is drawn
    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Keys are stored as (hcol, vcol), so this membership test
            # skips a pair whose transposed plot already exists.
            if (vcol == hcol) or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = Y[col].groupby([data[hcol],
                                        data[vcol]]).mean().unstack(hcol)
                # reindex(index=/columns=) replaces reindex_axis, which has
                # been removed from pandas. Rows are sorted descending for
                # seaborn and ascending otherwise, as before.
                value = value.reindex(
                    index=ordered(value.index, not use_seaborn, vcol))
                value = value.reindex(
                    columns=ordered(value.columns, True, hcol))
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if use_seaborn:
                    sns.heatmap(value,
                                ax=axes[i],
                                annot=True,
                                fmt='.2%' if is_cate else 'g')
                    axes[i].set_title(title, fontsize=fontsize)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value,
                                    ax=axes[i],
                                    xlabel='',
                                    ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index,
                                    title=title,
                                    fontsize=fontsize)
            if not use_seaborn:
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already have been
            # closed above or the backend may be unable to show it.
            pass
    return fig_dict
예제 #7
0
def plot_singleXY_Mean(X,
                       Y,
                       cols=None,
                       feature_cate=None,
                       normalize=True,
                       quantiles=None,
                       cuts=None,
                       pattern=r'\((.*?),',
                       str_nopattern=None,
                       ylabel=None,
                       fontsize=12,
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    For each selected X column draw two stacked bar charts: the number (or
    share) of samples per category, and the mean of Y per category. For a
    0-1 target the mean is the share of 1s in the category.

    Parameters
    ----------
    X : DataFrame with the raw data.
    Y : continuous or 0-1 target (Series, 1-D array or DataFrame).
    cols : list of columns to plot; defaults to all columns of X.
    feature_cate : list of X columns that are already discrete.
    normalize : if True, sample counts are divided by their column total.
    quantiles : dict mapping column name to the quantile points used to
        discretize it; columns not listed use 20, 40, 60, 80.
    cuts : dict mapping column name to explicit cut points; takes
        precedence over ``quantiles``.
    pattern : regular expression matched against the x-axis labels; labels
        are ordered by the numeric value extracted with it.
    str_nopattern : dict mapping column name (or position) to the ordering
        of the labels that do not match ``pattern``.
    ylabel : two-element list with the y labels of the upper and lower
        subplot; defaults to ['Count of Samples', 'Mean of Y'].
    fontsize : font size used for ticks, labels and title.
    figsize : figure size passed to ``plt.subplots``.
    close : if True, ``plt.close('all')`` is called after each figure.
    show_last : if True, try to show the last figure that was created.
    verbose : if True, print each column name as it is processed.

    Returns
    -------
    dict mapping column name to its figure (upper subplot: counts, lower
    subplot: mean of Y).
    '''
    data = X.copy()
    if str_nopattern is None:
        str_nopattern = {}
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if ylabel is None:
        ylabel = ['Count of Samples', 'Mean of Y']
    # Normalize into fresh dicts so the caller's arguments are not mutated.
    # np.unique already returns its result sorted.
    quantiles = dict((key, np.unique(val).tolist())
                     for key, val in (quantiles or {}).items())
    cuts = dict((key, np.unique(val)) for key, val in (cuts or {}).items())
    q_default = [20 * k for k in range(1, 5)]

    Ynew = pd.DataFrame(Y)
    # No legend for arrays and 1-D targets. np.ndim also handles plain
    # sequences, for which the original `Y.shape` raised AttributeError.
    legend = not (isinstance(Y, np.ndarray) or np.ndim(Y) == 1)

    fig_dict = {}
    fig = None  # stays None when cols is empty
    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=quantiles.get(col, q_default),
                return_numeric=False,
                fill_na='Missing')
            if col in cuts:
                # Explicit cut points bypass fitting.
                clf.cuts = cuts[col]
            else:
                clf.fit(data[col])
            data[col] = clf.transform(data[col])
        data[col] = data[col].fillna('Missing')

        # Counts and means come from one grouping and share one index,
        # so a single ordering serves both subplots.
        grouped = Ynew.groupby(data[col])
        value_count = grouped.count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        value_mean = grouped.mean()
        order = utils.sort(value_count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(col, None),
                           converter=float)
        value_count = value_count.reindex(order)
        value_mean = value_mean.reindex(order)

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar',
                         rot=30,
                         ax=axes[0],
                         legend=legend,
                         fontsize=fontsize)
        value_mean.plot(kind='bar',
                        rot=30,
                        ax=axes[1],
                        legend=legend,
                        fontsize=fontsize)
        axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
        axes[0].set_title(col, fontsize=fontsize)

        axes[1].set_xlabel('')
        axes[1].set_ylabel(ylabel[1], fontsize=fontsize)

        fig_dict[col] = fig
        if close:
            plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already have been
            # closed above or the backend may be unable to show it.
            pass

    return fig_dict
예제 #8
0
def plot_singleXY_PercentInY(X,
                             cols=None,
                             Y_cont=None,
                             Y_cate=None,
                             feature_cate=None,
                             quantiles=None,
                             cuts=None,
                             pattern=r'\((.*?),',
                             str_nopattern=None,
                             xlabel=None,
                             ylabel=None,
                             color_map=None,
                             legend_map=None,
                             fontsize=12,
                             figsize=(18, 8),
                             close=True,
                             show_last=True,
                             verbose=False):
    '''
    For each selected X column draw two stacked bar charts against a single
    target. With a 0-1 target: sample counts per category and class, and
    the share of each class falling in every category. With a continuous
    target: sample counts per category and the mean of Y per category.

    Parameters
    ----------
    X : DataFrame with the raw data.
    cols : list of columns to plot; defaults to all columns of X.
    Y_cont : continuous target (Series or 1-D array).
    Y_cate : 0-1 target (Series or 1-D array).
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate : list of X columns that are already discrete.
    quantiles : dict mapping column name to the quantile points used to
        discretize it; columns not listed use 20, 40, 60, 80.
    cuts : dict mapping column name to explicit cut points; takes
        precedence over ``quantiles``.
    pattern : regular expression matched against the x-axis labels; labels
        are ordered by the numeric value extracted with it.
    str_nopattern : dict mapping column name (or position) to the ordering
        of the labels that do not match ``pattern``.
    xlabel : label of the horizontal axis (set on the lower subplot).
    ylabel : two-element list with the y labels of the upper and lower
        subplot; defaults to ['Count of Samples', 'Percent in Category of
        Y'] for a 0-1 target and ['Count of Samples', 'Mean of Y'] for a
        continuous one.
    color_map : dict mapping raw Y values to bar colors, e.g.
        {1: 'red', 0: 'blue'}; 0-1 target only.
    legend_map : dict mapping raw Y values to legend labels, e.g.
        {1: 'bad', 0: 'good'}; 0-1 target only.
    fontsize : font size used for ticks, labels, titles and legends.
    figsize : figure size passed to ``plt.subplots``.
    close : if True, ``plt.close('all')`` is called after each figure.
    show_last : if True, try to show the last figure that was created.
    verbose : if True, print each column name as it is processed.

    Returns
    -------
    dict mapping every entry of ``cols`` to its figure (upper subplot:
    counts; lower subplot: within-class share or mean of Y).

    Raises
    ------
    ValueError if neither or both of Y_cont / Y_cate are given.
    '''
    if (Y_cont is None) and (Y_cate is None):
        raise ValueError('Y值未给定!')
    # The original repeated the condition above here, so passing both
    # targets was never rejected.
    if (Y_cont is not None) and (Y_cate is not None):
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    if xlabel is None:
        xlabel = ''
    if legend_map is None:
        legend_map = {}
    if color_map is None:
        color_map = {}
    # Normalize into fresh dicts so the caller's arguments are not mutated.
    # np.unique already returns its result sorted.
    quantiles = dict((key, np.unique(val).tolist())
                     for key, val in (quantiles or {}).items())
    cuts = dict((key, np.unique(val)) for key, val in (cuts or {}).items())
    q_default = [20 * k for k in range(1, 5)]

    is_cate = Y_cate is not None
    target = pd.Series(Y_cate if is_cate else Y_cont)
    if ylabel is None:
        if is_cate:
            ylabel = ['Count of Samples', 'Percent in Category of Y']
        else:
            ylabel = ['Count of Samples', 'Mean of Y']

    fig_dict = dict.fromkeys(cols)
    fig = None  # stays None when cols is empty
    for column in cols:
        if verbose:
            print(column)
        if column not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=quantiles.get(column, q_default),
                return_numeric=False,
                fill_na='Missing')
            if column in cuts:
                # Explicit cut points bypass fitting.
                clf.cuts = cuts[column]
            else:
                clf.fit(data[column])
            data[column] = clf.transform(data[column])
        data[column] = data[column].fillna('Missing')

        if is_cate:
            count = pd.crosstab(data[column], target)
            count.columns.name = ''
            count.index.name = column
        else:
            count = target.groupby(data[column]).count()
            count.name = ''
        # One ordering for both subplots: they share the x axis, so the
        # bars must line up. The original sorted the continuous `ratio`
        # with a hard-coded pattern and without str_nopattern.
        order = utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(column, None),
                           converter=float)
        count = count.reindex(order)

        plot_kwargs = dict(kind='bar', rot=0, fontsize=fontsize)
        if is_cate:
            # Look colors up by raw Y value before relabeling the columns.
            plot_kwargs['color'] = count.columns.map(
                lambda xx: color_map.get(xx, None)).tolist()
            count.columns = count.columns.map(
                lambda xx: legend_map.get(xx, xx))
            # Column-wise normalization: share of each Y class per bin.
            ratio = count / count.sum()
        else:
            ratio = target.groupby(data[column]).mean().reindex(order)
            ratio.name = ''

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        count.plot(ax=axes[0], **plot_kwargs)
        axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
        axes[0].set_title(column, fontsize=fontsize)
        axes[0].legend(loc='best', fontsize=fontsize)

        # fontsize is now applied to the lower plot in both branches.
        ratio.plot(ax=axes[1], **plot_kwargs)
        axes[1].set_xlabel(xlabel, fontsize=fontsize)
        axes[1].set_ylabel(ylabel[1], fontsize=fontsize)
        axes[1].legend(loc='best', fontsize=fontsize)

        if close:
            plt.close('all')
        fig_dict[column] = fig
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already have been
            # closed above or the backend may be unable to show it.
            pass
    return fig_dict