Пример #1
0
def plot_doubleXY_Mean(X,
                       cols_h=None,
                       cols_v=None,
                       Y_cont=None,
                       Y_cate=None,
                       feature_cate=None,
                       backend='seaborn',
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Draw mean-of-Y heatmaps for pairs of X columns.

    For a 0/1 categorical Y each cell is the share of ones; for a continuous Y
    it is the mean. Both are computed as a group mean of Y over the two
    (discretized) X columns.

    Parameters
    ----------
    X: DataFrame with the raw features.
    cols_h: list of columns used on the horizontal axis; defaults to all
        columns of X.
    cols_v: list of columns used on the vertical axis; defaults to all
        columns of X.
    Y_cont: continuous target(s); Series, 1-D array or DataFrame.
    Y_cate: 0/1 categorical target(s); Series, 1-D array or DataFrame.
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate: list of X columns that are already categorical and must
        not be discretized; defaults to an empty list.
    backend: plotting backend, one of {'seaborn', 'matplotlib'}; any value
        other than 'seaborn' uses the project's `heatmap` helper.
    figsize: figure size handed to `plt.subplots`.
    close: close all pyplot figures after each pair has been drawn.
    show_last: try to show the last figure that was created.
    verbose: print each (vertical, horizontal) pair while iterating.

    Returns
    -------
    fig_dict: dict keyed by `(horizontal_column, vertical_column)` whose
        values are the matplotlib figures (one subplot per Y column).

    Raises
    ------
    ValueError: if neither or both of Y_cont / Y_cate are given.
    '''
    # Validate first so that a bad call fails before any work is done.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    # Discretize the continuous columns; each column is listed once even when
    # it is used on both axes.
    feature_cont = list(dict.fromkeys(
        col for col in list(cols_v) + list(cols_h)
        if col not in feature_cate))
    clf = discretize.QuantileDiscretizer(
        feature_names=feature_cont,
        quantiles=[10 * i for i in range(1, 10)],
        return_numeric=False,
        fill_na='missing')
    data = clf.fit_transform(data)

    is_cate = Y_cate is not None
    Y = pd.DataFrame(Y_cate if is_cate else Y_cont)
    use_seaborn = backend == 'seaborn'
    sort_pattern = r'\((.*?),'
    fig_dict = {}
    fig = None
    n = Y.shape[1]
    cols_Y = sorted(Y.columns)
    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Skip the diagonal and the mirror image of a pair already drawn
            # (figures are stored under (hcol, vcol)).
            if vcol == hcol or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                # plt.subplots returns a bare Axes for a single subplot.
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = Y[col].groupby([data[hcol],
                                        data[vcol]]).mean().unstack(hcol)
                # Rows are sorted descending for seaborn and ascending
                # otherwise (presumably because seaborn draws the first row
                # at the top); columns are always ascending.
                row_order = utils.sort(value.index.tolist(),
                                       ascending=not use_seaborn,
                                       pattern=sort_pattern,
                                       converter=float)
                col_order = utils.sort(value.columns.tolist(),
                                       ascending=True,
                                       pattern=sort_pattern,
                                       converter=float)
                # DataFrame.reindex_axis was removed in pandas 1.0.
                value = value.reindex(index=row_order, columns=col_order)
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if use_seaborn:
                    fmt = '.2%' if is_cate else 'g'
                    sns.heatmap(value, ax=axes[i], annot=True, fmt=fmt)
                    axes[i].set_title(title)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value,
                                    ax=axes[i],
                                    xlabel='',
                                    ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index,
                                    title=title)
            if not use_seaborn:
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already be closed or the
            # backend may be non-interactive.
            pass
    return fig_dict
Пример #2
0
def plot_singleXY_PercentInY(X,
                             cols=None,
                             Y_cont=None,
                             Y_cate=None,
                             feature_cate=None,
                             figsize=(18, 8),
                             close=True,
                             show_last=True,
                             verbose=False):
    '''
    Plot one X column against one Y as two stacked bar charts.

    For a 0/1 categorical Y the upper chart shows sample counts per X bin and
    Y class, and the lower chart shows each bin's share within its Y class.
    For a continuous Y the upper chart shows sample counts per X bin and the
    lower chart shows the mean of Y per bin.

    Parameters
    ----------
    X: DataFrame with the raw features.
    cols: list of columns to plot; defaults to all columns of X.
    Y_cont: continuous target; Series or 1-D array.
    Y_cate: 0/1 categorical target; Series or 1-D array.
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate: list of X columns that are already categorical and must
        not be discretized; defaults to an empty list.
    figsize: figure size handed to `plt.subplots`.
    close: close all pyplot figures after each column has been drawn.
    show_last: try to show the last figure that was created.
    verbose: print each column name while iterating.

    Returns
    -------
    fig_dict: dict keyed by column name; each value is the figure holding the
        two subplots described above.

    Raises
    ------
    ValueError: if neither or both of Y_cont / Y_cate are given.
    '''
    # Validate first so that a bad call fails before any work is done.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []

    is_cate = Y_cate is not None
    Y = pd.Series(Y_cate if is_cate else Y_cont)
    lower_ylabel = 'Percent in Category of Y' if is_cate else 'Mean of Y'
    sort_pattern = r'\((.*?),'
    fig_dict = dict.fromkeys(cols)
    fig = None
    for column in cols:
        if verbose:
            print(column)
        if column not in feature_cate:
            if is_cate:
                clf = discretize.QuantileDiscretizer(
                    quantiles=[20 * k for k in range(1, 5)],
                    return_numeric=False,
                    fill_na='missing')
            else:
                # NOTE(review): unlike the categorical-Y path no fill_na is
                # passed here -- confirm this difference is intended.
                clf = discretize.QuantileDiscretizer(
                    quantiles=[20 * k for k in range(1, 5)],
                    return_numeric=False)
            data[column] = clf.fit_transform(data[column])

        if is_cate:
            count = pd.crosstab(data[column], Y)
            count.columns.name = ''
            count.index.name = column
        else:
            count = Y.groupby(data[column]).count()
            count.name = ''
        order = utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=sort_pattern,
                           converter=float)
        count = count.reindex(order)
        if is_cate:
            # Column-wise normalisation: share of each bin within a Y class.
            ratio = count / count.sum()
        else:
            ratio = Y.groupby(data[column]).mean()
            ratio.name = ''
            ratio = ratio.reindex(
                utils.sort(ratio.index.tolist(),
                           ascending=True,
                           pattern=sort_pattern,
                           converter=float))

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        count.plot(kind='bar', ax=axes[0], rot=0)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Count of Samples')
        axes[0].set_title(column)
        axes[0].legend(loc='best')

        ratio.plot(kind='bar', ax=axes[1], rot=0)
        axes[1].set_xlabel('')
        axes[1].set_ylabel(lower_ylabel)
        axes[1].legend(loc='best')

        if close:
            plt.close('all')
        fig_dict[column] = fig
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already be closed or the
            # backend may be non-interactive.
            pass
    return fig_dict
Пример #3
0
def plot_singleXY_Mean(X,
                       Y,
                       cols=None,
                       feature_cate=None,
                       normalize=True,
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Plot one X column against one or more Y columns as two stacked bar charts.

    The upper chart shows the number (or share) of samples per X bin and the
    lower chart shows the mean of each Y per bin. For a 0/1 Y the mean is the
    share of ones.

    Parameters
    ----------
    X: DataFrame with the raw features.
    Y: continuous or 0/1 target(s); Series, 1-D array or DataFrame.
    cols: list of columns to plot; defaults to all columns of X.
    feature_cate: list of X columns that are already categorical and must
        not be discretized; defaults to an empty list.
    normalize: divide the per-bin counts by their column totals so the upper
        chart shows shares instead of raw counts.
    figsize: figure size handed to `plt.subplots`.
    close: close all pyplot figures after each column has been drawn.
    show_last: try to show the last figure that was created.
    verbose: print each column name while iterating.

    Returns
    -------
    fig_dict: dict keyed by column name; each value is the figure holding the
        count subplot (top) and the mean-of-Y subplot (bottom).
    '''
    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    Ynew = pd.DataFrame(Y)
    # A bare 1-D array has no column name worth showing in a legend.
    legend = not (isinstance(Y, np.ndarray) and len(Y.shape) == 1)
    sort_pattern = r'\((.*?),'
    fig_dict = {}
    fig = None
    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=[20 * k for k in range(1, 5)],
                return_numeric=False,
                fill_na='Missing')
            data[col] = clf.fit_transform(data[col])

        grouped = Ynew.groupby(data[col])
        value_count = grouped.count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        value_count = value_count.reindex(
            utils.sort(value_count.index.tolist(),
                       ascending=True,
                       pattern=sort_pattern,
                       converter=float))
        value_mean = grouped.mean()
        value_mean = value_mean.reindex(
            utils.sort(value_mean.index.tolist(),
                       ascending=True,
                       pattern=sort_pattern,
                       converter=float))

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar', rot=30, ax=axes[0], legend=legend)
        value_mean.plot(kind='bar', rot=30, ax=axes[1], legend=legend)
        axes[0].set_xlabel('')
        axes[0].set_ylabel('Count of Samples')
        axes[0].set_title(col)

        axes[1].set_xlabel('')
        axes[1].set_ylabel('Mean of Y')

        fig_dict[col] = fig
        if close:
            plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already be closed or the
            # backend may be non-interactive.
            pass

    return fig_dict
Пример #4
0
def test():
    """Walk through the BasicDataStruct API on a small random data set.

    Builds a 10-row frame with four continuous and four categorical columns,
    then adds, replaces, deletes and renames features and edits the
    categorical feature lists, printing `current_state()` after every step.
    """
    # Build the test data
    np.random.seed(13)
    X = pd.DataFrame(np.random.randn(10, 4),
                     columns=['cont1', 'cont2', 'cont3', 'cont4'])
    X['cate_two1'] = np.random.choice([0, 1], 10)
    X['cate_two2'] = np.random.choice([0, 1], 10)
    X['cate_mult1'] = np.random.choice([1, 2, 3, 4, 5], 10)
    X['cate_mult2'] = np.random.choice([1, 2, 3, 4, 5], 10)

    feature_cate_two = ['cate_two1', 'cate_two2']
    feature_cate_mult = ['cate_mult1', 'cate_mult2']

    # Create the object under test
    model = BasicDataStruct(X,
                            Y=None,
                            feature_cate_two=feature_cate_two,
                            feature_cate_mult=feature_cate_mult,
                            model_type='classification')
    print(model.current_state())

    # Build new variables through transformations, either replacing or adding
    # Add as a new column
    x_exp = np.exp(model.X['cont1'])
    model.add(x_exp, replace=False, ignore=False, suffix='_exp')
    print(model.current_state())

    # Replace existing columns
    model.add(np.sin(model.X[['cont2', 'cont3']]), replace=True)
    print(model.current_state())

    # Discretize a continuous variable and add it as a new column
    import src.discretize as discretize
    clf = discretize.QuantileDiscretizer()
    model.add(clf.fit_transform(model.X['cont4']),
              feature_cate_mult=['cont4'],
              replace=False,
              ignore=False,
              suffix='dis')
    print(model.current_state())

    # Discretize a continuous variable and replace the original column
    clf = discretize.QuantileDiscretizer()
    model.add(clf.fit_transform(model.X[['cont1']]),
              feature_cate_mult=['cont1'],
              replace=True)
    print(model.current_state())

    # Delete existing variables
    model.delete(features_todel=['cont1_exp', 'cate_mult1'])
    print(model.current_state())

    # Rename
    model.rename({'cont4dis': 'cont4_dis'})
    print(model.current_state())

    # Change the categorical feature lists (add)
    model.addFeature(feature_cate_two=['cate_two1'],
                     feature_cate_mult=['cont1'])
    print(model.current_state())

    # Change the categorical feature lists (remove)
    model.delFeature(feature_cate_two=['cate_two1'],
                     feature_cate_mult=['cont1'])
    print(model.current_state())
Пример #5
0
# Save every figure of the previous analysis step, one PNG per column.
# `result`, `outfile`, `path` and `model` are defined earlier in the script.
for col in result:
    result[col].savefig(outfile+'%s.png'%col)

# Two-variable X vs. Y analysis plots
outfile=path+'两变量X与Y的分析图/'

result=DataAnalysis.plot_doubleXY_Mean(model.X,Y_cate=model.Y,
                                       feature_cate=model.feature_cate_two+model.feature_cate_mult,
                                       backend='seaborn',close=True,show_last=True,verbose=True)

# Keys are (horizontal, vertical) column pairs; both names go into the file name.
for cols in result:
    result[cols].savefig(outfile+'%s-%s.png'%(cols[0],cols[1]))


#%% Extract discretized features (missing values are treated as their own category)
clf_dis=discretize.QuantileDiscretizer(quantiles=[10*i for i in range(1,10)],fill_na='Missing',return_numeric=False)
Xnew_dis=clf_dis.fit_transform(model.X[model.feature_cont])

model_dis=BasicDataStruct.BasicDataStruct(X=Xnew_dis,Y=None,
                                          feature_cate_two=[],
                                          feature_cate_mult=Xnew_dis.columns.tolist(),
                                          model_type='classification')
print('连续特征离散化:')
print(model_dis.current_state())

#%% Impute missing values in the original variables
clf_imputer_cont=Imputer(strategy='mean')
clf_imputer_cate=Imputer(strategy='most_frequent')

# Only impute when there is at least one continuous feature.
if model.feature_cont!=[]:
    model.X[model.feature_cont]=clf_imputer_cont.fit_transform(model.X[model.feature_cont])
Пример #6
0
def plot_doubleXY_Mean(X,
                       cols_h=None,
                       cols_v=None,
                       Y_cont=None,
                       Y_cate=None,
                       feature_cate=None,
                       quantiles=None,
                       cuts=None,
                       pattern=r'\((.*?),',
                       str_nopattern=None,
                       fontsize=12,
                       backend='seaborn',
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Draw mean-of-Y heatmaps for pairs of X columns.

    For a 0/1 categorical Y each cell is the share of ones; for a continuous Y
    it is the mean. Both are computed as a group mean of Y over the two
    (discretized) X columns.

    Parameters
    ----------
    X: DataFrame with the raw features.
    cols_h: list of columns used on the horizontal axis; defaults to all
        columns of X.
    cols_v: list of columns used on the vertical axis; defaults to all
        columns of X.
    Y_cont: continuous target(s); Series, 1-D array or DataFrame.
    Y_cate: 0/1 categorical target(s); Series, 1-D array or DataFrame.
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate: list of X columns that are already categorical and must
        not be discretized; defaults to an empty list.
    quantiles: dict mapping a column name to a list / 1-D array of quantile
        points used to discretize it; columns without an entry use
        [10 * i for i in range(1, 10)]. The dict is not modified.
    cuts: dict mapping a column name to a list / 1-D array of explicit cut
        points; takes precedence over `quantiles`. The dict is not modified.
    pattern: regular expression handed to `utils.sort` to extract the number
        that axis labels are ordered by.
    str_nopattern: dict mapping a column name to a list handed to
        `utils.sort` for labels that do not match `pattern`.
    fontsize: font size of the titles.
    backend: plotting backend, one of {'seaborn', 'matplotlib'}; any value
        other than 'seaborn' uses the project's `heatmap` helper.
    figsize: figure size handed to `plt.subplots`.
    close: close all pyplot figures after each pair has been drawn.
    show_last: try to show the last figure that was created.
    verbose: print each (vertical, horizontal) pair while iterating.

    Returns
    -------
    fig_dict: dict keyed by `(horizontal_column, vertical_column)` whose
        values are the matplotlib figures (one subplot per Y column).

    Raises
    ------
    ValueError: if neither or both of Y_cont / Y_cate are given.
    '''
    # Validate first so that a bad call fails before any work is done.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols_v is None:
        cols_v = list(data.columns)
    if cols_h is None:
        cols_h = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    # Normalise into fresh dicts so the caller's arguments are left untouched.
    # np.unique already returns its result sorted.
    quantiles = {key: np.unique(val).tolist()
                 for key, val in (quantiles or {}).items()}
    cuts = {key: np.unique(val) for key, val in (cuts or {}).items()}
    q_default = [10 * i for i in range(1, 10)]

    # Discretize the continuous columns, each one exactly once.
    feature_cont = list(dict.fromkeys(
        col for col in list(cols_v) + list(cols_h)
        if col not in feature_cate))
    for column in feature_cont:
        clf = discretize.QuantileDiscretizer(
            quantiles=quantiles.get(column, q_default),
            return_numeric=False,
            fill_na='Missing')
        if column in cuts:
            # Explicit cut points replace fitting on the data.
            clf.cuts = cuts[column]
        else:
            clf.fit(data[column])
        data[column] = clf.transform(data[column])
    data = data.fillna('Missing')

    is_cate = Y_cate is not None
    Y = pd.DataFrame(Y_cate if is_cate else Y_cont)
    use_seaborn = backend == 'seaborn'
    fig_dict = {}
    fig = None
    n = Y.shape[1]
    cols_Y = sorted(Y.columns)
    for vcol in cols_v:
        for hcol in cols_h:
            if verbose:
                print(vcol, hcol)
            # Skip the diagonal and the mirror image of a pair already drawn
            # (figures are stored under (hcol, vcol)).
            if vcol == hcol or (vcol, hcol) in fig_dict:
                continue
            fig, axes = plt.subplots(n, 1, figsize=figsize)
            if n == 1:
                # plt.subplots returns a bare Axes for a single subplot.
                axes = np.array([axes])
            for i, col in enumerate(cols_Y):
                value = Y[col].groupby([data[hcol],
                                        data[vcol]]).mean().unstack(hcol)
                # Rows are sorted descending for seaborn and ascending
                # otherwise (presumably because seaborn draws the first row
                # at the top); columns are always ascending.
                row_order = utils.sort(
                    value.index.tolist(),
                    ascending=not use_seaborn,
                    pattern=pattern,
                    str_nopattern=str_nopattern.get(vcol, None),
                    converter=float)
                col_order = utils.sort(
                    value.columns.tolist(),
                    ascending=True,
                    pattern=pattern,
                    str_nopattern=str_nopattern.get(hcol, None),
                    converter=float)
                # DataFrame.reindex_axis was removed in pandas 1.0.
                value = value.reindex(index=row_order, columns=col_order)
                value = value.fillna(0)
                if i == 0:
                    title = 'Horizontal: %s <---> Vertical: %s\n%s' % (
                        hcol, vcol, col)
                else:
                    title = col
                if use_seaborn:
                    fmt = '.2%' if is_cate else 'g'
                    sns.heatmap(value, ax=axes[i], annot=True, fmt=fmt)
                    axes[i].set_title(title, fontsize=fontsize)
                    axes[i].set_xlabel('')
                    axes[i].set_ylabel('')
                else:
                    pc, _ = heatmap(value,
                                    ax=axes[i],
                                    xlabel='',
                                    ylabel='',
                                    xticklabels=value.columns,
                                    yticklabels=value.index,
                                    title=title,
                                    fontsize=fontsize)
            if not use_seaborn:
                plt.colorbar(pc, ax=axes.ravel().tolist())
            plt.xticks(rotation=30)
            plt.yticks(rotation=30)
            fig_dict[(hcol, vcol)] = fig
            if close:
                plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already be closed or the
            # backend may be non-interactive.
            pass
    return fig_dict
Пример #7
0
def plot_singleXY_Mean(X,
                       Y,
                       cols=None,
                       feature_cate=None,
                       normalize=True,
                       quantiles=None,
                       cuts=None,
                       pattern=r'\((.*?),',
                       str_nopattern=None,
                       ylabel=None,
                       fontsize=12,
                       figsize=(18, 8),
                       close=True,
                       show_last=True,
                       verbose=False):
    '''
    Plot one X column against one or more Y columns as two stacked bar charts.

    The upper chart shows the number (or share) of samples per X bin and the
    lower chart shows the mean of each Y per bin. For a 0/1 Y the mean is the
    share of ones.

    Parameters
    ----------
    X: DataFrame with the raw features.
    Y: continuous or 0/1 target(s); Series, 1-D array, list or DataFrame.
    cols: list of columns to plot; defaults to all columns of X.
    feature_cate: list of X columns that are already categorical and must
        not be discretized; defaults to an empty list.
    normalize: divide the per-bin counts by their column totals so the upper
        chart shows shares instead of raw counts.
    quantiles: dict mapping a column name to a list / 1-D array of quantile
        points used to discretize it; columns without an entry use
        [20 * i for i in range(1, 5)]. The dict is not modified.
    cuts: dict mapping a column name to a list / 1-D array of explicit cut
        points; takes precedence over `quantiles`. The dict is not modified.
    pattern: regular expression handed to `utils.sort` to extract the number
        that x-axis labels are ordered by.
    str_nopattern: dict mapping a column name to a list handed to
        `utils.sort` for labels that do not match `pattern`.
    ylabel: two-element list with the y-axis labels of the upper and lower
        subplot; defaults to ['Count of Samples', 'Mean of Y'].
    fontsize: font size of tick labels, axis labels and titles.
    figsize: figure size handed to `plt.subplots`.
    close: close all pyplot figures after each column has been drawn.
    show_last: try to show the last figure that was created.
    verbose: print each column name while iterating.

    Returns
    -------
    fig_dict: dict keyed by column name; each value is the figure holding the
        count subplot (top) and the mean-of-Y subplot (bottom).
    '''
    data = X.copy()
    if str_nopattern is None:
        str_nopattern = {}
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if ylabel is None:
        ylabel = ['Count of Samples', 'Mean of Y']
    # Normalise into fresh dicts so the caller's arguments are left untouched.
    # np.unique already returns its result sorted.
    quantiles = {key: np.unique(val).tolist()
                 for key, val in (quantiles or {}).items()}
    cuts = {key: np.unique(val) for key, val in (cuts or {}).items()}
    q_default = [20 * i for i in range(1, 5)]
    fig_dict = {}
    fig = None
    Ynew = pd.DataFrame(Y)
    # No legend for any ndarray or any 1-D input. np.ndim also accepts plain
    # lists, which have no `.shape` attribute.
    legend = not (isinstance(Y, np.ndarray) or np.ndim(Y) == 1)
    for col in cols:
        if verbose:
            print(col)
        if col not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=quantiles.get(col, q_default),
                return_numeric=False,
                fill_na='Missing')
            if col in cuts:
                # Explicit cut points replace fitting on the data.
                clf.cuts = cuts[col]
            else:
                clf.fit(data[col])
            data[col] = clf.transform(data[col])
        data[col] = data[col].fillna('Missing')

        grouped = Ynew.groupby(data[col])
        value_count = grouped.count()
        if normalize:
            value_count = value_count / value_count.sum(axis=0)
        value_count = value_count.reindex(
            utils.sort(value_count.index.tolist(),
                       ascending=True,
                       pattern=pattern,
                       str_nopattern=str_nopattern.get(col, None),
                       converter=float))
        value_mean = grouped.mean()
        value_mean = value_mean.reindex(
            utils.sort(value_mean.index.tolist(),
                       ascending=True,
                       pattern=pattern,
                       str_nopattern=str_nopattern.get(col, None),
                       converter=float))

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        value_count.plot(kind='bar',
                         rot=30,
                         ax=axes[0],
                         legend=legend,
                         fontsize=fontsize)
        value_mean.plot(kind='bar',
                        rot=30,
                        ax=axes[1],
                        legend=legend,
                        fontsize=fontsize)
        axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
        axes[0].set_title(col, fontsize=fontsize)

        axes[1].set_xlabel('')
        axes[1].set_ylabel(ylabel[1], fontsize=fontsize)

        fig_dict[col] = fig
        if close:
            plt.close('all')
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already be closed or the
            # backend may be non-interactive.
            pass

    return fig_dict
Пример #8
0
def plot_singleXY_PercentInY(X,
                             cols=None,
                             Y_cont=None,
                             Y_cate=None,
                             feature_cate=None,
                             quantiles=None,
                             cuts=None,
                             pattern=r'\((.*?),',
                             str_nopattern=None,
                             xlabel=None,
                             ylabel=None,
                             color_map=None,
                             legend_map=None,
                             fontsize=12,
                             figsize=(18, 8),
                             close=True,
                             show_last=True,
                             verbose=False):
    '''
    Plot one X column against one Y as two stacked bar charts.

    For a 0/1 categorical Y the upper chart shows sample counts per X bin and
    Y class, and the lower chart shows each bin's share within its Y class.
    For a continuous Y the upper chart shows sample counts per X bin and the
    lower chart shows the mean of Y per bin.

    Parameters
    ----------
    X: DataFrame with the raw features.
    cols: list of columns to plot; defaults to all columns of X.
    Y_cont: continuous target; Series or 1-D array.
    Y_cate: 0/1 categorical target; Series or 1-D array.
        Exactly one of Y_cont / Y_cate must be given.
    feature_cate: list of X columns that are already categorical and must
        not be discretized; defaults to an empty list.
    quantiles: dict mapping a column name to a list / 1-D array of quantile
        points used to discretize it; columns without an entry use
        [20 * i for i in range(1, 5)]. The dict is not modified.
    cuts: dict mapping a column name to a list / 1-D array of explicit cut
        points; takes precedence over `quantiles`. The dict is not modified.
    pattern: regular expression handed to `utils.sort` to extract the number
        that x-axis labels are ordered by.
    str_nopattern: dict mapping a column name to a list handed to
        `utils.sort` for labels that do not match `pattern`.
    xlabel: x-axis label of the lower subplot; defaults to ''.
    ylabel: two-element list with the y-axis labels of the upper and lower
        subplot; defaults to ['Count of Samples', 'Percent in Category of Y']
        for a categorical Y and ['Count of Samples', 'Mean of Y'] for a
        continuous Y.
    color_map: dict mapping an original Y value to a bar colour, e.g.
        {1: 'red', 0: 'blue'}; categorical Y only.
    legend_map: dict mapping an original Y value to its legend text, e.g.
        {1: 'bad', 0: 'good'}; categorical Y only.
    fontsize: font size of tick labels, axis labels, titles and legends.
    figsize: figure size handed to `plt.subplots`.
    close: close all pyplot figures after each column has been drawn.
    show_last: try to show the last figure that was created.
    verbose: print each column name while iterating.

    Returns
    -------
    fig_dict: dict keyed by column name; each value is the figure holding the
        two subplots described above.

    Raises
    ------
    ValueError: if neither or both of Y_cont / Y_cate are given.
    '''
    # Validate first so that a bad call fails before any work is done.
    if Y_cont is None and Y_cate is None:
        raise ValueError('Y值未给定!')
    if Y_cont is not None and Y_cate is not None:
        raise ValueError('连续型和离散型Y值只能给定一种!')

    data = X.copy()
    if cols is None:
        cols = list(data.columns)
    if feature_cate is None:
        feature_cate = []
    if str_nopattern is None:
        str_nopattern = {}
    if xlabel is None:
        xlabel = ''
    if legend_map is None:
        legend_map = {}
    if color_map is None:
        color_map = {}
    # Normalise into fresh dicts so the caller's arguments are left untouched.
    # np.unique already returns its result sorted.
    quantiles = {key: np.unique(val).tolist()
                 for key, val in (quantiles or {}).items()}
    cuts = {key: np.unique(val) for key, val in (cuts or {}).items()}
    q_default = [20 * i for i in range(1, 5)]

    is_cate = Y_cate is not None
    Y = pd.Series(Y_cate if is_cate else Y_cont)
    if ylabel is None:
        if is_cate:
            ylabel = ['Count of Samples', 'Percent in Category of Y']
        else:
            ylabel = ['Count of Samples', 'Mean of Y']
    fig_dict = dict.fromkeys(cols)
    fig = None
    for column in cols:
        if verbose:
            print(column)
        if column not in feature_cate:
            clf = discretize.QuantileDiscretizer(
                quantiles=quantiles.get(column, q_default),
                return_numeric=False,
                fill_na='Missing')
            if column in cuts:
                # Explicit cut points replace fitting on the data.
                clf.cuts = cuts[column]
            else:
                clf.fit(data[column])
            data[column] = clf.transform(data[column])
        data[column] = data[column].fillna('Missing')

        bar_kwargs = {'kind': 'bar', 'rot': 0, 'fontsize': fontsize}
        if is_cate:
            count = pd.crosstab(data[column], Y)
            count.columns.name = ''
            count.index.name = column
            count = count.reindex(
                utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(column, None),
                           converter=float))
            # Colours are looked up by the original Y values, so resolve them
            # before the columns are renamed for the legend.
            bar_kwargs['color'] = count.columns.map(
                lambda xx: color_map.get(xx, None)).tolist()
            count.columns = count.columns.map(
                lambda xx: legend_map.get(xx, xx))
            # Column-wise normalisation: share of each bin within a Y class.
            ratio = count / count.sum()
        else:
            grouped = Y.groupby(data[column])
            count = grouped.count()
            count.name = ''
            count = count.reindex(
                utils.sort(count.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(column, None),
                           converter=float))
            ratio = grouped.mean()
            ratio.name = ''
            # Sort with the same pattern / str_nopattern as `count` so both
            # subplots agree on the shared x axis.
            ratio = ratio.reindex(
                utils.sort(ratio.index.tolist(),
                           ascending=True,
                           pattern=pattern,
                           str_nopattern=str_nopattern.get(column, None),
                           converter=float))

        fig, axes = plt.subplots(2, 1, sharex=True, figsize=figsize)
        count.plot(ax=axes[0], **bar_kwargs)
        axes[0].set_ylabel(ylabel[0], fontsize=fontsize)
        axes[0].set_title(column, fontsize=fontsize)
        axes[0].legend(loc='best', fontsize=fontsize)

        ratio.plot(ax=axes[1], **bar_kwargs)
        axes[1].set_xlabel(xlabel, fontsize=fontsize)
        axes[1].set_ylabel(ylabel[1], fontsize=fontsize)
        axes[1].legend(loc='best', fontsize=fontsize)

        if close:
            plt.close('all')
        fig_dict[column] = fig
    if show_last and fig is not None:
        try:
            fig.show()
        except Exception:
            # Showing is best effort: the figure may already be closed or the
            # backend may be non-interactive.
            pass
    return fig_dict