Exemplos de drop_columns em Python, exemplos de data_clean_utils.drop_columns em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: data_clean_paying_taxes.py Projeto: BarryZM/BFSUIEEEISIWorldCup

def pre_clean():
    file_name = u'一般纳税人'
    dcu.merge_status(file_name,
                     u'认定日期'.encode('utf-8'), [], [],
                     empty_mask='0000-00-00')
    dcu.merge_status(file_name,
                     u'纳税人状态'.encode('utf-8'), [], [],
                     empty_mask='Unknown')
    dcu.merge_status(file_name,
                     u'出口状态备案状态'.encode('utf-8'), [], [],
                     empty_mask='Unknown')
    dcu.merge_status(file_name,
                     u'登记注册类型'.encode('utf-8'), [], [],
                     empty_mask='Unknown')
    dcu.merge_status(file_name,
                     u'纳税人资格'.encode('utf-8'), [], [],
                     empty_mask='Unknown')

    dcu.drop_columns(file_name, u'有效日期期起'.encode('utf-8'))
    dcu.drop_columns(file_name, u'有效截止日期'.encode('utf-8'))
    dcu.drop_columns(file_name, u'是否具有一般纳税人资格'.encode('utf-8'))
    dcu.drop_columns(file_name, u'扣缴义务'.encode('utf-8'))
    dcu.drop_columns(file_name, u'是否按季申报'.encode('utf-8'))

    return

Exemplo n.º 2

0

Exibir arquivo

def clean_gdscjytddy():
    file_name = u'购地-市场交易-土地抵押'



    dcu.drop_columns(file_name, u'土地抵押人性质'.encode('utf-8'))

    dcu.merge_status(file_name, u'抵押面积(公顷)'.encode('utf-8'), [], [], empty_mask='0')
    dcu.merge_status(file_name, u'土地面积'.encode('utf-8'), [], [], empty_mask='0')
    dcu.merge_status(file_name, u'评估金额(万元)'.encode('utf-8'), [], [], empty_mask='0')
    dcu.merge_status(file_name, u'抵押金额(万元)'.encode('utf-8'), [], [], empty_mask='0')
    dcu.merge_status(file_name, u'土地抵押登记结束时间'.encode('utf-8'), [], [], empty_mask='0000-00-00')

    time_rearranged(file_name, u'土地抵押登记起始时间'.encode('utf-8'), i = 0)
    time_split(file_name, 'time0'.encode('utf-8'), i = 0)
    time_rearranged(file_name, u'土地抵押登记结束时间'.encode('utf-8'), i = 1)
    time_split(file_name, 'time1'.encode('utf-8'), i=1)

    land_usage(file_name, u'土地用途'.encode('utf-8'))
    land_usage(file_name, u'抵押土地用途'.encode('utf-8'))

    dcu.extract_keyword(file_name, u'抵押土地权属性质与使用权类型'.encode('utf-8'), [u'国有', u'集体'], empty_mask='-1', others_mask='3')
    land_status(file_name, u'抵押土地权属性质与使用权类型'.encode('utf-8')) # 1：国有，2：集体，3：others，-1：Unknown

    return

Exemplo n.º 3

0

Exibir arquivo

Arquivo: data_clean_soft_assets.py Projeto: BarryZM/BFSUIEEEISIWorldCup

def empty_value_handle_trademark():
    """
    Dirty value handle for table 商标.xlsx.
    First we'll drop rows that empty value is too many.
    # ['主营业务收入','净利润','利润总额','所有者权益合计', '纳税总额','营业总收入','负债总额','资产总额']
    # Once there are more than 3 empties in these 8 columns we will drop that row.
    Then we check nulls column by column and decide how to process with it.
    Next we should numeric all the value for future process.
    After these are done, it's time to work out features we can use in this table which belongs
        to exploratory data analysis.

    -----------------------------
    商标状态
    ------
    Empty percentage is 0.2597%(367 out of 141312). We replace them as 'Unknown'.

    -----------------------------
    申请日期
    ------
    Empty percentage is 0.3637%(514 out of 141312). We replace with '1000-01-01'.
    Others are well formatted.

    -----------------------------
    专用权期限开始日期
    ------
    All empty, drop it.

    -----------------------------
    专用权期限结束日期
    ------
    Empty percentage is 21.4922%(30371 out of 141312). This column's value can be extract from '商标使用期限时间段', so we
    drop it.
    -----------------------------
    商标使用期限时间段
    ------
    Empty percentage is 1.5915%(2249 out of 141312). We map them to '1000-01-01至1000-01-01'.
    Others are well formatted except some are '至', for these value we change to '1000-01-01至1000-01-01'.

    -----------------------------
    :return:
    """
    df = file_utils.read_file_to_df(clean_data_temp_file_url, u'商标')
    values = {u'商标状态'.encode('utf-8'): 'Unknown', u'申请日期'.encode('utf-8'): '1000-01-01',
              u'商标使用期限时间段'.encode('utf-8'): u'1000-01-01至1000-01-01'}
    df = df.fillna(values)
    file_utils.write_file(df, clean_data_temp_file_url, u'商标')

    dcu.drop_columns(u'商标', [u'专用权期限开始日期'.encode('utf-8')])
    dcu.drop_columns(u'商标', [u'专用权期限结束日期'.encode('utf-8')])

    status_1 = [u'至']
    status_list = [status_1]
    status_after = [u'1000-01-01至1000-01-01']

    dcu.merge_status(u'商标', u'商标使用期限时间段'.encode('utf-8'), status_list, status_after)
    return

Exemplo n.º 4

0

Exibir arquivo

def clean_gddkgs():
    file_name = u'购地-地块公示'

    time_rearranged(file_name, u'时间'.encode('utf-8'))
    time_split(file_name, 'time0', i = 0)


    dcu.drop_columns(file_name, u'行政区'.encode('utf-8'))
    dcu.drop_columns(file_name, u'出让年限'.encode('utf-8'))
    dcu.drop_columns(file_name, u'土地使用条件'.encode('utf-8'))
    dcu.drop_columns(file_name, u'公示日期'.encode('utf-8'))
    dcu.drop_columns(file_name, u'成交价（万元）'.encode('utf-8'))

    land_usage(file_name, u'土地用途'.encode('utf-8'))

    return

Exemplo n.º 5

0

Exibir arquivo

def clean_bond():
    file_name = u'债券信息'
    dcu.merge_status(file_name, u'债券信用评级'.encode('utf-8'), [], [], empty_mask='Unknown')
    dcu.merge_status(file_name, u'付息日期'.encode('utf-8'), [], [], empty_mask='00-00')
    dcu.merge_status(file_name, u'兑付日期'.encode('utf-8'), [], [], empty_mask='0000-00-00')
    dcu.merge_status(file_name, u'主体信用评级'.encode('utf-8'), [], [], empty_mask='Unknown')  # 空值改为Unknown
    dcu.merge_status(file_name, u'债券品种'.encode('utf-8'),[], [], empty_mask='Unknown')
    dcu.merge_status(file_name, u'付息方式'.encode('utf-8'), [], [], empty_mask='Unknown')


    dcu.drop_unit(file_name, u'债券期限'.encode('utf-8'), [u'年'], empty_mask= -1)

    wr1 = fu.read_file_to_df(clean_data_temp_file_url,file_name,
                             sheet_name='Sheet')
    wr1 = wr1.fillna({u'纳税人资格'.encode('utf-8'): 'unknown'})  # 对空值进行处理以进行索引
    fu.write_file(wr1, clean_data_temp_file_url, file_name, ext='.xlsx',
                  sheet_name='Sheet', index=False)

    wr1 = fu.read_file_to_df(clean_data_temp_file_url, file_name,
                             sheet_name='Sheet')
    wr1 = wr1.fillna({u'票面利率（%）'.encode('utf-8'): 'unknown'})  # 对空值进行处理以进行索引
    fu.write_file(wr1, clean_data_temp_file_url, file_name, ext='.xlsx',
                  sheet_name='Sheet', index=False)


    dcu.drop_columns(file_name, u'币种'.encode('utf-8'))
    dcu.drop_columns(file_name, u'流通场所'.encode('utf-8'))
    dcu.drop_columns(file_name, u'实际发行总额（亿元）'.encode('utf-8'))


    ranking_of_bond(file_name, u'债券信用评级'.encode('utf-8'))
    kind_of_bond(file_name, u'债券品种'.encode('utf-8'))
    ranking_of_co(file_name, u'主体信用评级'.encode('utf-8'))
    interest_pay(file_name, u'付息方式'.encode('utf-8'))

    time_rearranged(file_name, u'发行日期'.encode('utf-8'), i = 0)
    time_rearranged(file_name, u'兑付日期'.encode('utf-8'), i = 1)


    return

Exemplo n.º 6

0

Exibir arquivo

def clean_basic_info():
    file_name = u'工商基本信息表'

    dcu.drop_prefix_unit(file_name, u'行业小类（代码）'.encode('utf-8'), 'C')
    dcu.drop_prefix_unit(file_name, u'行业小类（代码）'.encode('utf-8'), 'J')

    dcu.merge_status(file_name, u'注册资本币种(正则)'.encode('utf-8'), [], [], empty_mask='Unknown')
    dcu.merge_status(file_name, u'经营状态'.encode('utf-8'), [], [], empty_mask='Unknown')
    dcu.merge_status(file_name, u'行业大类（代码）'.encode('utf-8'), [], [], empty_mask='Unknown')
    dcu.merge_status(file_name,  u'行业小类（代码）'.encode('utf-8'), [], [], empty_mask='-1')
    dcu.merge_status(file_name, u'成立日期'.encode('utf-8'), [], [], empty_mask='0000-00-00')
    dcu.merge_status(file_name, u'员工人数'.encode('utf-8'), [], [], empty_mask='0')
    dcu.merge_status(file_name, u'注销原因'.encode('utf-8'), [], [], empty_mask='-1')
    dcu.merge_status(file_name, u'注销时间'.encode('utf-8'), [], [], empty_mask='-1')

    time_rearranged(file_name, u'成立日期'.encode('utf-8'))

    dcu.drop_columns(file_name, u'城市代码'.encode('utf-8'))
    dcu.drop_columns(file_name, u'地区代码'.encode('utf-8'))
    dcu.drop_columns(file_name, u'登记机关区域代码'.encode('utf-8'))
    dcu.drop_columns(file_name, u'发照日期'.encode('utf-8'))
    dcu.drop_columns(file_name, u'经营期限自'.encode('utf-8'))
    dcu.drop_columns(file_name, u'经营期限至'.encode('utf-8'))

    money_kind(file_name,  u'注册资本币种(正则)'.encode('utf-8'))
    industry_category(file_name, u'行业大类（代码）'.encode('utf-8'))
    status_of_company(file_name, u'经营状态'.encode('utf-8'))
    kind_of_company(file_name, u'类型'.encode('utf-8'))
    whether_on_stock_market(file_name, u'是否上市'.encode('utf-8'))
    reason_log_out(file_name, u'注销原因'.encode('utf-8'))

    return

Exemplo n.º 7

0

Exibir arquivo

def data_clean_finance_ylnlzb():
    """
            Dirty value handle for table 上市信息财务信息盈利能力指标.xlsx.
    
        ['企业总评分','标题','日期','加权净资产收益率(%)','摊薄净资产收益率(%)','摊薄总资产收益率(%)','毛利率(%)','净利率(%)','实际税率(%)']

    
       -----------------------------
        标题
        ------
        drop this column
        -----------------------------
        日期
        ------
        no change
        -----------------------------
        加权净资产收益率(%)
        ------
        turn '--%' into 'NA'
        -----------------------------
        摊薄净资产收益率(%)
        ------
        turn '--%' into 'NA'
        -----------------------------
        摊薄总资产收益率(%)
        ------
        turn '--%' into 'NA'
        -----------------------------
        毛利率(%)
        ------
        turn '--%' into 'NA'
        if >100%
            turn into 'ERROR'
        -----------------------------
        净利率(%)
        ------
        turn '--%' into 'NA'
        if >100%
            turn into 'ERROR'
        -----------------------------
        实际税率(%)
        ------
        turn '--%' into 'NA'
        if >100%
            turn into 'ERROR'
        -----------------------------
    """
    dcu.drop_columns(u'上市信息财务信息盈利能力指标', u'标题')
    dcu.adjust_time(u'上市信息财务信息盈利能力指标', u'日期')

    status_normal = [u'--%']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个

    dcu.merge_status(u'上市信息财务信息盈利能力指标',
                     u'加权净资产收益率(%)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息盈利能力指标',
                     u'摊薄净资产收益率(%)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息盈利能力指标',
                     u'摊薄总资产收益率(%)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息盈利能力指标',
                     u'毛利率(%)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息盈利能力指标',
                     u'净利率(%)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息盈利能力指标',
                     u'实际税率(%)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')

    unit_strs = [u'%']
    dcu.drop_unit(u'上市信息财务信息盈利能力指标', u'加权净资产收益率(%)', unit_strs)
    dcu.drop_unit(u'上市信息财务信息盈利能力指标', u'摊薄净资产收益率(%)', unit_strs)
    dcu.drop_unit(u'上市信息财务信息盈利能力指标', u'摊薄总资产收益率(%)', unit_strs)
    dcu.drop_unit(u'上市信息财务信息盈利能力指标', u'毛利率(%)', unit_strs)
    dcu.drop_unit(u'上市信息财务信息盈利能力指标', u'净利率(%)', unit_strs)
    dcu.drop_unit(u'上市信息财务信息盈利能力指标', u'实际税率(%)', unit_strs)

    # 标记不合理的数据
    #    dcu.mark_invalid_num_data(u'temp', u'a', '>', 100, error_mask='-65535')
    dcu.mark_invalid_num_data(u'上市信息财务信息盈利能力指标',
                              u'毛利率(%)'.encode('utf-8'),
                              '>',
                              100,
                              error_mask='-65535')
    dcu.mark_invalid_num_data(u'上市信息财务信息盈利能力指标',
                              u'净利率(%)'.encode('utf-8'),
                              '>',
                              100,
                              error_mask='-65535')
    dcu.mark_invalid_num_data(u'上市信息财务信息盈利能力指标',
                              u'实际税率(%)'.encode('utf-8'),
                              '>',
                              100,
                              error_mask='-65535')

    return

Exemplo n.º 8

0

Exibir arquivo

def data_clean_finance_xjllb():
    """
        Dirty value handle for table 上市信息财务信息-现金流量表.xlsx.
    
    
        ['企业总评分','标题','日期','经营:销售商品、提供劳务收到的现金(元)','经营:收到的税费返还(元)','经营:收到其他与经营活动有关的现金(元)',
            '经营:经营活动现金流入小计(元)','经营:购买商品、接受劳务支付的现金(元)','经营:支付给职工以及为职工支付的现金(元)',
            '经营:支付的各项税费(元)','经营:支付其他与经营活动有关的现金(元)','经营:经营活动现金流出小计(元)','经营:经营活动产生的现金流量净额(元)',
            '投资:取得投资收益所收到的现金(元)','投资:处置固定资产、无形资产和其他长期资产收回的现金净额(元)','投资:投资活动现金流入小计(元)',
            '投资:购建固定资产、无形资产和其他长期资产支付的现金(元)','投资:投资支付的现金(元)','投资:投资活动现金流出小计(元)',
            '投资:投资活动产生的现金流量净额(元)','筹资:吸收投资收到的现金(元)','筹资:取得借款收到的现金(元)','筹资:筹资活动现金流入小计(元)',
            '筹资:偿还债务支付的现金(元)','筹资:分配股利、利润或偿付利息支付的现金(元)','筹资:筹资活动现金流出小计(元)','筹资活动产生的现金流量净额(元)']
    
    
    
        -----------------------------
        标题
        ------
        drop this column
        -----------------------------
        日期
        ------
        no change
        -----------------------------
        经营:销售商品、提供劳务收到的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:收到的税费返还(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:收到其他与经营活动有关的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:经营活动现金流入小计(元)
        ------
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:购买商品、接受劳务支付的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:支付给职工以及为职工支付的现金(元)
        ------
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:支付的各项税费(元)
        ------
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:支付其他与经营活动有关的现金(元)
        ------
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:经营活动现金流出小计(元)
        ------
        turn '--' into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        经营:经营活动产生的现金流量净额(元)
        ------
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        投资:取得投资收益所收到的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        投资:处置固定资产、无形资产和其他长期资产收回的现金净额(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        投资:投资活动现金流入小计(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        投资:购建固定资产、无形资产和其他长期资产支付的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        投资:投资支付的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        投资:投资活动现金流出小计(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        投资:投资活动产生的现金流量净额(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        筹资:吸收投资收到的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        筹资:取得借款收到的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        筹资:筹资活动现金流入小计(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        筹资:偿还债务支付的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        筹资:分配股利、利润或偿付利息支付的现金(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        筹资:筹资活动现金流出小计(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        筹资活动产生的现金流量净额(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
    """
    dcu.drop_columns(u'上市信息财务信息-现金流量表', u'标题'.encode('utf-8'))
    dcu.adjust_time(u'上市信息财务信息-现金流量表', u'日期'.encode('utf-8'))

    status_normal = [u'--']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个

    # 循环的写法
    # data_frame = file_utils.read_file_to_df(clean_data_temp_file_url,u'上市信息财务信息-现金流量表')
    #
    # for column in data_frame.columns:
    #     dcu.merge_status(u'上市信息财务信息-现金流量表', column, status_list, status_after)
    #     dcu.change_number(u'上市信息财务信息-现金流量表', column)

    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:销售商品、提供劳务收到的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:收到的税费返还(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:收到其他与经营活动有关的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:经营活动现金流入小计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:购买商品、接受劳务支付的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:支付给职工以及为职工支付的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:支付的各项税费(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:支付其他与经营活动有关的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:经营活动现金流出小计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'经营:经营活动产生的现金流量净额(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'投资:取得投资收益所收到的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'投资:处置固定资产、无形资产和其他长期资产收回的现金净额(元)',
                     status_list, status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'投资:投资活动现金流入小计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'投资:购建固定资产、无形资产和其他长期资产支付的现金(元)',
                     status_list, status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'投资:投资支付的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'投资:投资活动现金流出小计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'投资:投资活动产生的现金流量净额(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'筹资:吸收投资收到的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'筹资:取得借款收到的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'筹资:筹资活动现金流入小计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'筹资:偿还债务支付的现金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'筹资:分配股利、利润或偿付利息支付的现金(元)',
                     status_list, status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'筹资:筹资活动现金流出小计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息-现金流量表', u'筹资活动产生的现金流量净额(元)', status_list,
                     status_after)
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:销售商品、提供劳务收到的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:收到的税费返还(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:收到其他与经营活动有关的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:经营活动现金流入小计(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:购买商品、接受劳务支付的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:支付给职工以及为职工支付的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:支付的各项税费(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:支付其他与经营活动有关的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:经营活动现金流出小计(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'经营:经营活动产生的现金流量净额(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'投资:取得投资收益所收到的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'投资:处置固定资产、无形资产和其他长期资产收回的现金净额(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'投资:投资活动现金流入小计(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'投资:购建固定资产、无形资产和其他长期资产支付的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'投资:投资支付的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'投资:投资活动现金流出小计(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'投资:投资活动产生的现金流量净额(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'筹资:吸收投资收到的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'筹资:取得借款收到的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'筹资:筹资活动现金流入小计(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'筹资:偿还债务支付的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'筹资:分配股利、利润或偿付利息支付的现金(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'筹资:筹资活动现金流出小计(元)')
    dcu.change_number(u'上市信息财务信息-现金流量表', u'筹资活动产生的现金流量净额(元)')

    return

Exemplo n.º 9

0

Exibir arquivo

def data_clean_finance_mgzb():
    """
            Dirty value handle for table 上市公司财务信息-每股指标.xlsx.
        # ['企业总评分','标题','日期','基本每股收益(元)', '扣非每股收益(元)','稀释每股收益(元)',
        '每股净资产(元)','每股公积金(元)','每股未分配利润(元)','每股经营现金流(元)']
        In this table, we turn all the '--' and nulls into 'NA'. Valid data are all in the form of double(float)

        -----------------------------
       企业总评分(企业编号)
        ------
        no change
        -----------------------------
        标题
        ------
        drop this column
        -----------------------------
        日期
        ------
        no change
        -----------------------------
        基本每股收益(元)
        ------
        turn '--' into 'NA'
        -----------------------------
        扣非每股收益(元)
        ------
        turn '--' into 'NA'
        -----------------------------
        稀释每股收益(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        -----------------------------
        每股净资产(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        -----------------------------
        每股公积金(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        -----------------------------
        每股未分配利润(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        -----------------------------
        每股经营现金流(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        -----------------------------
    """
    dcu.drop_columns(u'上市公司财务信息-每股指标', u'标题'.encode('utf-8'))
    dcu.adjust_time(u'上市公司财务信息-每股指标', u'日期'.encode('utf-8'))
    dcu.merge_rows_by_columns(u'上市公司财务信息-每股指标',
                              keys=[corporate_index_false, u'日期'])

    # dcu.drop_columns(u'temp', u'c')
    # dcu.change_number('temp','a')

    status_normal = [u'--']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个
    dcu.merge_status(u'上市公司财务信息-每股指标', u'基本每股收益(元)', status_list, status_after)
    dcu.merge_status(u'上市公司财务信息-每股指标', u'扣非每股收益(元)', status_list, status_after)
    dcu.merge_status(u'上市公司财务信息-每股指标',
                     u'稀释每股收益(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')  # 空值改为Unknown
    dcu.merge_status(u'上市公司财务信息-每股指标',
                     u'每股净资产(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市公司财务信息-每股指标',
                     u'每股公积金(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市公司财务信息-每股指标',
                     u'每股未分配利润(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市公司财务信息-每股指标',
                     u'每股经营现金流(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    return

Exemplo n.º 10

0

Exibir arquivo

def data_clean_finance_lrb():
    """
        Dirty value handle for table 上市信息财务信息-利润表.xlsx.
    First we'll drop rows that empty value is too many.
   ['企业总评分','标题','日期','营业收入(元)','营业成本(元)','销售费用(元)','财务费用(元)',
   '管理费用(元)','资产减值损失(元)','投资收益(元)','营业利润(元)','利润总额(元)','所得税(元)','归属母公司所有者净利润(元)']


    -----------------------------
    标题
    ------
    drop this column
    -----------------------------
    日期
    ------
    no change
    -----------------------------
    营业收入(元)
    ------
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    营业成本(元)
    ------
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    销售费用(元)
    ------
    turn '--' into 'NA'
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    财务费用(元)
    ------
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    管理费用(元)
    ------
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    资产减值损失(元)
    ------
    turn '--' into 'NA'
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    投资收益(元)
    ------
    turn '--' into 'NA'
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    营业利润(元)
    ------
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    利润总额(元)
    ------
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    所得税(元)
    ------
    turn '--' into 'NA'
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    归属母公司所有者净利润(元)
    ------
    turn null into 'NA'
    if end with u'万'
        drop u'万'
        *10^4
    if end with u'亿'
        drop u'亿'
        *10^8

    -----------------------------
    """
    dcu.drop_columns(u'上市信息财务信息-利润表', u'标题'.encode('utf-8'))
    dcu.adjust_time(u'上市信息财务信息-利润表', u'日期'.encode('utf-8'))

    status_normal = [u'--', u'--%']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个

    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'营业收入(元)', [],
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'营业成本(元)', [],
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'销售费用(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'财务费用(元)', [],
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'管理费用(元)', [],
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'资产减值损失(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'投资收益(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'营业利润(元)', [],
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'利润总额(元)', [],
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'所得税(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-利润表',
                     u'归属母公司所有者净利润(元)',
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.change_number(u'上市信息财务信息-利润表', u'营业收入(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'营业成本(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'销售费用(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'财务费用(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'管理费用(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'资产减值损失(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'投资收益(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'营业利润(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'利润总额(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'所得税(元)')
    dcu.change_number(u'上市信息财务信息-利润表', u'归属母公司所有者净利润(元)')

    return

Exemplo n.º 11

0

Exibir arquivo

def data_clean_finance_cwfxzb():
    """
        Dirty value handle for table 上市信息财务信息-财务风险指标.xlsx.
        First we'll drop rows that empty value is too many.
        # ['企业总评分','标题','日期','资产负债率(%)','流动负债/总负债(%)','流动比率','速动比率']
        In this table, we turn all the '--' and nulls into 'NA'. Valid data are all in the form of double(float)


        -----------------------------
        标题
        ------
        drop this column
        -----------------------------
        日期
        ------
        no change
        -----------------------------
        资产负债率(%)
        ------
        turn '--%' into 'NA'
        -----------------------------
        流动负债/总负债(%)
        ------
        turn '--%' into 'NA'
        -----------------------------
        流动比率
        ------
        turn '--' into 'NA'
        -----------------------------
        速动比率
        ------
        turn '--' into 'NA'
        -----------------------------
    """
    dcu.drop_columns(u'上市信息财务信息-财务风险指标', u'标题'.encode('utf-8'))
    dcu.adjust_time(u'上市信息财务信息-财务风险指标', u'日期'.encode('utf-8'))

    status_normal = [u'--', u'--%']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个

    dcu.merge_status(u'上市信息财务信息-财务风险指标',
                     u'资产负债率(%)'.encode('utf-8'),
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-财务风险指标',
                     u'流动负债/总负债(%)'.encode('utf-8'),
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-财务风险指标',
                     u'流动比率'.encode('utf-8'),
                     status_list,
                     status_after,
                     empty_mask='Unknown')
    dcu.merge_status(u'上市信息财务信息-财务风险指标',
                     u'速动比率'.encode('utf-8'),
                     status_list,
                     status_after,
                     empty_mask='Unknown')

    # 去百分号
    # dcu.drop_unit(u'temp', u'a', unit_strs)
    unit_strs = [u'%']
    dcu.drop_unit(u'上市信息财务信息-财务风险指标', u'资产负债率(%)'.encode('utf-8'), unit_strs)
    dcu.drop_unit(u'上市信息财务信息-财务风险指标', u'流动负债/总负债(%)'.encode('utf-8'),
                  unit_strs)
    return

Exemplo n.º 12

0

Exibir arquivo

def data_clean_landing_purchase_jggg():
    """
                Dirty value handle for table 购地-结果公告.xlsx.
        First we'll drop rows that empty value is too many.
        ['总面积','土地用途','供应方式','签订日期','土地来源','土地使用年限','行业分类','土地级别','成交价格（万元）',
        '约定容积率上限','约定容积率下限','约定交地时间','约定开工时间','约定竣工时间','实际开工时间','实际竣工时间','批准单位']

        -----------------------------
        总面积
        ------
        turn null into 'Unknown'
        -----------------------------
        土地用途
        ------
        turn into several integers
        -----------------------------
        供应方式
        ------
        97% null,  turn null into 'Unknown',turn into several integers
        -----------------------------
        签订日期
        ------
        turn into integers
        -----------------------------
        土地来源
        ------
        99% null,drop
        -----------------------------
        土地使用年限
        ------
        96% null, turn null into 'Unknown', clean wrong numbers
        -----------------------------
        行业分类
        ------
        99% null, drop
        -----------------------------
        土地级别
        ------
        the smaller the number, the more expensive the land, 96% null
        -----------------------------
        成交价格（万元）
        ------
        86% null, turn null into 'Unknown'

        -----------------------------
        约定容积率上限
        ------
        99% null, turn null into 'Unknown'
        -----------------------------
        约定容积率下限
        ------
        99% null, turn null into 'Unknown'
        -----------------------------
        约定交地时间
        ------
        99% null, turn into integers
        -----------------------------
        约定开工时间
        ------
        97% null, turn into integers
        -----------------------------
        约定竣工时间
        ------
        99% null, turn into integers
        -----------------------------
        实际开工时间
        ------
        100% null, drop
        -----------------------------
        实际竣工时间
        ------
        100% null, drop
        -----------------------------
        批准单位
        ------
        99% null, drop
        -----------------------------
        """
    file_name = u'购地-结果公告'

    dcu.drop_columns(file_name, u'约定开工时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'约定竣工时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'实际开工时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'实际竣工时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'约定交地时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'供应方式'.encode('utf-8'))
    dcu.drop_columns(file_name, u'批准单位'.encode('utf-8'))
    dcu.drop_columns(file_name, u'成交价格（万元）'.encode('utf-8'))
    dcu.drop_columns(file_name, u'约定容积率上限'.encode('utf-8'))
    dcu.drop_columns(file_name, u'约定容积率下限'.encode('utf-8'))
    dcu.drop_columns(file_name, u'土地级别'.encode('utf-8'))
    dcu.drop_columns(file_name, u'行业分类'.encode('utf-8'))
    dcu.drop_columns(file_name, u'土地来源'.encode('utf-8'))
    dcu.drop_columns(file_name, u'土地使用年限'.encode('utf-8'))

    # 签订日期：需要去掉小数部分
    time_rearranged(file_name, u'签订日期'.encode('utf-8'), i=0)
    time_split(file_name, 'time0', i=0)

    land_usage(file_name, u'土地用途'.encode('utf-8'))

    dcu.merge_status(file_name, u'总面积'.encode('utf-8'), [], [],empty_mask= 0)
    return

Exemplo n.º 13

0

Exibir arquivo

def data_clean_finance_yynlzb():
    """
        Dirty value handle for table 上市信息财务信息运营能力指标.xlsx.
    
        ['企业总评分','标题','日期','总资产周转率(次)','应收账款周转天数(天)','存货周转天数(天)']
    
    
        -----------------------------
        标题
        ------
        drop this column
        -----------------------------
        日期
        ------
        no change
        -----------------------------
        总资产周转率(次)
        ------
        turn null into 'NA'
        -----------------------------
        应收账款周转天数(天)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if <0
            turn into 'ERROR'
        -----------------------------
        存货周转天数(天)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if <0
            turn into 'ERROR'
        -----------------------------
        """
    dcu.drop_columns(u'上市信息财务信息运营能力指标', u'标题'.encode('utf-8'))
    dcu.adjust_time(u'上市信息财务信息运营能力指标', u'日期'.encode('utf-8'))

    status_normal = [u'--']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个

    # 第一行本来是数值型的，只能重新赋值成数值型的
    dcu.merge_status(u'上市信息财务信息运营能力指标',
                     u'总资产周转率(次)'.encode('utf-8'),
                     status_list,
                     status_after,
                     empty_mask='-65535')
    dcu.merge_status(u'上市信息财务信息运营能力指标', u'应收账款周转天数(天)'.encode('utf-8'),
                     status_list, status_after)
    dcu.merge_status(u'上市信息财务信息运营能力指标', u'存货周转天数(天)'.encode('utf-8'),
                     status_list, status_after)

    dcu.mark_invalid_num_data(u'上市信息财务信息运营能力指标',
                              u'应收账款周转天数(天)'.encode('utf-8'),
                              '<',
                              0,
                              error_mask=-65535)
    dcu.mark_invalid_num_data(u'上市信息财务信息运营能力指标',
                              u'存货周转天数(天)'.encode('utf-8'),
                              '<',
                              0,
                              error_mask=-65535)

    return

Exemplo n.º 14

0

Exibir arquivo

def clean_gdscjytdzr():
    file_name = u'购地-市场交易-土地转让'
    time_rearranged(file_name, u'成交时间'.encode('utf-8'), i = 0)
    time_split(file_name, 'time0', i = 0)

    dcu.drop_columns(file_name, u'土地使用年限'.encode('utf-8'))
    dcu.drop_columns(file_name, u'土地级别'.encode('utf-8'))
    dcu.drop_columns(file_name, u'土地使用权类型'.encode('utf-8'))
    dcu.drop_columns(file_name, u'土地利用状况'.encode('utf-8'))
    dcu.drop_columns(file_name, u'转让方式'.encode('utf-8'))
    dcu.drop_columns(file_name, u'转让价格(万元)'.encode('utf-8'))

    dcu.merge_status(file_name, u'土地面积(公顷)'.encode('utf-8'), [], [], empty_mask='0')

    land_usage(file_name, u'土地用途'.encode('utf-8'))

    return

Exemplo n.º 15

0

Exibir arquivo

def empty_value_handle_basic_info():
    """
    empty_value handle for table 年报-企业基本信息.
        Dirty value handle for table 年报-企业基本信息.
    First we'll drop rows that empty value is too many.
    ['企业经营状态','从业人数','是否有网站或网点','企业是否有投资信息或购买其他公司股权',
        '有限责任公司本年度是否发生股东股权转','是否提供对外担保']
    Once there are more than 3 empties in these 6 columns we will drop that row.
    Then we check nulls column by column and decide how to process with it.
    Next we should numeric all the value for future process.
    After these are done, it's time to work out features we can use in this table which belongs
        to exploratory data analysis.

    -----------------------------
    注册资本
    ------
    Based on the primary analysis data, we can drop column 注册资本 which empty percentage is 88%
    -----------------------------
    企业经营状态
    ------
    Empty percentage is 0%(1 out of 14862).
    8 status this value has, they are ['停业','其他','存续','开业','开业/正常经营','歇业','正常开业','清算'].
    We just add another status for the empty value:'Unknown'.
    And based on the counts for every status, we simplify these status to ['正常经营','非正常经营','Unknown']
    ['开业','开业/正常经营','正常开业'] belongs to '正常经营' and ['停业','其他','存续','歇业','清算'] belongs to '非正常经营'.
    So we can map these total 9 status to three: {'正常经营':0,'非正常经营':1,'Unknown':-1}.
    -----------------------------
    从业人数
    ------
    Empty percentage is 0%(0 out of 14862), and some value end with '人' while some are pure number.
    But also there are lots of value valued '企业选择不公示'(11623) and a few valued '人' without number.
    For empty value, we replace with -1 indicating there's no value(be careful here, we don't trigger them as -1 people,
        -1 here works as a status). Those end with '人', we simply drop '人'. Those valued '企业选择不公示',
        we replace it as number 0 which also works as a status, there's 8 '0人's in the original value but
        shouldn't matter.
    -----------------------------
    是否有网站或网点
    ------
    Empty percentage is 0%(0 out of 14862).
    There are 4 status here:['否','无','是','有'], and ['否','无'] should belong to 'No', ['是','有'] belong to 'Yes'.
    -----------------------------
    企业是否有投资信息或购买其他公司股权
    ------
    Empty percentage is 0.02%(3 out of 14862).
    There are 4 status here:['否','无','是','有'], and ['否','无'] should belong to 'No', ['是','有'] belong to 'Yes'.
    Empty value will be mapped to 'Unknown'.
    -----------------------------
    有限责任公司本年度是否发生股东股权转
    ------
    Empty percentage is 0.013%(2 out of 14862).
    There are 4 status here:['否','无','是','有'], and ['否','无'] should belong to 'No', ['是','有'] belong to 'Yes'.
    Empty value will be mapped to 'Unknown'.
    -----------------------------
    是否提供对外担保
    ------
    Empty percentage is 0.075%(11 out of 14862).
    There are 2 status here:['否','是'], we map them to ['No', 'Yes'].
    Empty value will be mapped to 'Unknown'.
    -----------------------------
    发布日期
    ------
    Empty percentage is 0%(0 out of 14862).
    And it's well formatted, so without any process on this column.

    -----------------------------
    年报年份
    ------
    Empty percentage is 0%(0 out of 14862).
    And it's well formatted, so without any process on this column.
    -----------------------------
    :return:
    """
    # EMPTY CHECK
    empty_check_list = [
        u'企业经营状态'.encode('utf-8'), u'从业人数'.encode('utf-8'),
        u'是否有网站或网点'.encode('utf-8'), u'企业是否有投资信息或购买其他公司股权'.encode('utf-8'),
        u'有限责任公司本年度是否发生股东股权转'.encode('utf-8'), u'是否提供对外担保'.encode('utf-8')
    ]
    dcu.drop_rows_too_many_empty(u'年报-企业基本信息.xlsx',
                                 columns=empty_check_list,
                                 thresh=3)

    # LIST OUT VALUES AFTER EMPTY ROWS HANDLED
    panaly.list_category_columns_values([u'年报-企业基本信息'],
                                        u'年报-企业基本信息_empty_handled',
                                        file_url=clean_data_temp_file_url)

    # COLUMNS HANDLE
    # 注册资本
    dcu.drop_columns(u'年报-企业基本信息', [u'注册资本'.encode('utf-8')])

    # 企业经营状态
    status_normal = [u'开业', u'开业/正常经营', u'正常开业']
    status_unnormal = [u'停业', u'其他', u'存续', u'歇业', u'清算']
    status_list = [status_normal, status_unnormal]
    status_after = [u'正常经营', u'非正常经营', u'Unknown']
    dcu.merge_status(u'年报-企业基本信息', u'企业经营状态'.encode('utf-8'), status_list,
                     status_after)

    # 从业人数
    dcu.drop_unit(u'年报-企业基本信息',
                  u'从业人数'.encode('utf-8'), [u'人', u' 人'],
                  empty_mask=-1)

    # 是否有网站或网点
    yn_status_n = [u'否', u'无']
    yn_status_y = [u'是', u'有']
    yn_status_list = [yn_status_n, yn_status_y]
    yn_status_after = ['No', 'Yes']

    dcu.merge_status(u'年报-企业基本信息', u'是否有网站或网点'.encode('utf-8'), yn_status_list,
                     yn_status_after)

    # 企业是否有投资信息或购买其他公司股权
    dcu.merge_status(u'年报-企业基本信息', u'企业是否有投资信息或购买其他公司股权'.encode('utf-8'),
                     yn_status_list, yn_status_after)

    # 有限责任公司本年度是否发生股东股权转
    dcu.merge_status(u'年报-企业基本信息', u'有限责任公司本年度是否发生股东股权转'.encode('utf-8'),
                     yn_status_list, yn_status_after)

    # 是否提供对外担保
    dcu.merge_status(u'年报-企业基本信息', u'是否提供对外担保'.encode('utf-8'), yn_status_list,
                     yn_status_after)

    # 发布日期

    # 年报年份

    return

Exemplo n.º 16

0

Exibir arquivo

def empty_value_handle_social_security_info():
    """
    Dirty value handle for table 年报-社保信息.xlsx.
    First we'll drop rows that empty value is too many.
    ['单位参加城镇职工基本养老保险累计欠缴金额','单位参加城镇职工基本养老保险缴费基数','单位参加失业保险累计欠缴金额',
    '单位参加失业保险缴费基数', '单位参加工伤保险累计欠缴金额','单位参加工伤保险缴费基数','单位参加生育保险缴费基数',
    '参加城镇职工基本养老保险本期实际缴费金额','工伤保险人数']
    Once there are more than 3 empties in these 9 columns we will drop that row.
    Then we check nulls column by column and decide how to process with it.
    Next we should numeric all the value for future process.
    After these are done, it's time to work out features we can use in this table which belongs
        to exploratory data analysis.

    -----------------------------
    城镇职工基本养老保险人数
    ------
    Empty percentage is 0.1265%(7 out of 5532). We mark them as -1.
    Other values are well formatted with end '人', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    失业保险人数
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1.
    Other values are well formatted with end '人', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    职工基本医疗保险人数
    ------
    Empty percentage is 0.1085%(6 out of 5532). We mark them as -1.
    Other values are well formatted with end '人', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    工伤保险人数
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1.
    Other values are well formatted with end '人', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    生育保险人数
    ------
    Empty percentage is 0.1085%(6 out of 5532). We mark them as -1.
    Other values are well formatted with end '人', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    单位参加城镇职工基本养老保险缴费基数
    ------
    Empty percentage is 4.3745%(242 out of 5532). We mark them as -1. There is 592 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1.

    -----------------------------
    单位参加失业保险缴费基数
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1. There is 592 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1.

    -----------------------------
    单位参加职工基本医疗保险缴费基数
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1. There is 592 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1.

    -----------------------------
    单位参加工伤保险缴费基数
    ------
    Empty percentage is 96.9631%(5364 out of 5532). We need to drop this column.

    -----------------------------
    单位参加生育保险缴费基数
    ------
    Empty percentage is 0.0723%(4 out of 5532). We mark them as -1. There is 593 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1.

    -----------------------------
    参加城镇职工基本养老保险本期实际缴费金额
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1. There is 590 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(239) and we think them as missing, so they
    belong to -1.


    -----------------------------
    参加失业保险本期实际缴费金额
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1. There is 590 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(239) and we think them as missing, so they
    belong to -1.

    -----------------------------
    参加职工基本医疗保险本期实际缴费金额
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1. There is 590 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1.

    -----------------------------
    参加工伤保险本期实际缴费金额
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1. There is 590 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(313) and we think them as missing, so they
    belong to -1.

    -----------------------------
    参加生育保险本期实际缴费金额
    ------
    Empty percentage is 0.0904%(5 out of 5532). We mark them as -1. There is 590 is '企业选择不公示', and 325 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1.

    -----------------------------
    单位参加城镇职工基本养老保险累计欠缴金额
    ------
    Empty percentage is 0%(0 out of 5532). There is 596 is '企业选择不公示', and 324 '选择不公示',
    we merge them into 'NP'. Also there is one valued with minus number, we just remove the minus.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(240) and we think them as missing, so they
    belong to -1.

    -----------------------------
    单位参加失业保险累计欠缴金额
    ------
    Empty percentage is 0%(0 out of 5532). There is 596 is '企业选择不公示', and 324 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1. Also there is one valued with minus number, we just remove the minus.

    -----------------------------
    单位参加职工基本医疗保险累计欠缴金额
    ------
    Empty percentage is 0%(0 out of 5532). There is 596 is '企业选择不公示', and 324 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1. Also there is one valued with minus number, we just remove the minus.

    -----------------------------
    单位参加工伤保险累计欠缴金额
    ------
    Empty percentage is 0%(0 out of 5532). There is 600 is '企业选择不公示', and 324 '选择不公示',
    we merge them into 'NP'. Also there is one valued with minus number, we just remove the minus.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1. Also there is one valued with minus number, we just remove the minus.

    -----------------------------
    单位参加生育保险累计欠缴金额
    ------
    Empty percentage is 0%(0 out of 5532). There is 596 is '企业选择不公示', and 324 '选择不公示',
    we merge them into 'NP'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks. Be care we have some valued '万元'(235) and we think them as missing, so they
    belong to -1. Also there is one valued with minus number, we just remove the minus.

    -----------------------------
    年报年份
    ------
    Empty percentage is 0%(0 out of 5532).
    This is well formatted.

    -----------------------------
    :return:
    """
    empty_check_list = [
        u'单位参加城镇职工基本养老保险累计欠缴金额'.encode('utf-8'),
        u'单位参加城镇职工基本养老保险缴费基数'.encode('utf-8'),
        u'单位参加失业保险累计欠缴金额'.encode('utf-8'), u'单位参加失业保险缴费基数'.encode('utf-8'),
        u'单位参加工伤保险累计欠缴金额'.encode('utf-8'), u'单位参加工伤保险缴费基数'.encode('utf-8'),
        u'单位参加生育保险缴费基数'.encode('utf-8'), u'城镇职工基本养老保险人数'.encode('utf-8'),
        u'失业保险人数'.encode('utf-8'), u'参加失业保险本期实际缴费金额'.encode('utf-8'),
        u'参加工伤保险本期实际缴费金额'.encode('utf-8'),
        u'参加城镇职工基本养老保险本期实际缴费金额'.encode('utf-8'), u'工伤保险人数'.encode('utf-8')
    ]
    dcu.drop_rows_too_many_empty(u'年报-社保信息.xlsx',
                                 columns=empty_check_list,
                                 thresh=3)
    panaly.list_category_columns_values([u'年报-社保信息'],
                                        u'年报-社保信息_empty_handled',
                                        file_url=clean_data_temp_file_url)

    dcu.drop_columns(u'年报-社保信息', [u'单位参加工伤保险缴费基数'.encode('utf-8')])

    status_np = [u'企业选择不公示', u'选择不公示']
    status_list = [status_np]
    status_after = ['NP']

    file_people_list = [
        u'城镇职工基本养老保险人数'.encode('utf-8'), u'失业保险人数'.encode('utf-8'),
        u'职工基本医疗保险人数'.encode('utf-8'), u'工伤保险人数'.encode('utf-8'),
        u'生育保险人数'.encode('utf-8')
    ]
    file_cash_list = [
        u'单位参加城镇职工基本养老保险缴费基数'.encode('utf-8'), u'单位参加失业保险缴费基数'.encode('utf-8'),
        u'单位参加职工基本医疗保险缴费基数'.encode('utf-8'), u'单位参加生育保险缴费基数'.encode('utf-8'),
        u'参加城镇职工基本养老保险本期实际缴费金额'.encode('utf-8'),
        u'参加失业保险本期实际缴费金额'.encode('utf-8'),
        u'参加职工基本医疗保险本期实际缴费金额'.encode('utf-8'),
        u'参加工伤保险本期实际缴费金额'.encode('utf-8'), u'参加生育保险本期实际缴费金额'.encode('utf-8'),
        u'单位参加城镇职工基本养老保险累计欠缴金额'.encode('utf-8'),
        u'单位参加失业保险累计欠缴金额'.encode('utf-8'),
        u'单位参加职工基本医疗保险累计欠缴金额'.encode('utf-8'),
        u'单位参加工伤保险累计欠缴金额'.encode('utf-8'), u'单位参加生育保险累计欠缴金额'.encode('utf-8')
    ]

    for column in file_people_list:
        dcu.merge_status(u'年报-社保信息', column, status_list, status_after)
        dcu.drop_unit(u'年报-社保信息', column, [u'人', u' 人'], empty_mask=-1)

    for column in file_cash_list:
        dcu.merge_status(u'年报-社保信息', column, status_list, status_after)
        dcu.drop_unit_remove_minus(u'年报-社保信息',
                                   column, [u'万元', u' 万元'],
                                   empty_mask=-1)

    return

Exemplo n.º 17

0

Exibir arquivo

def empty_value_handle_out_warrant_info():
    """
    Dirty value handle for table 年报-的对外提供保证担保信息.xlsx.
    First we'll drop rows that empty value is too many.
    ['主债权数额','主债权种类','保证的方式']
    Once there are more than 3 empties in these 3 columns we will drop that row.
    Then we check nulls column by column and decide how to process with it.
    Next we should numeric all the value for future process.
    After these are done, it's time to work out features we can use in this table which belongs
        to exploratory data analysis.

    -----------------------------
    主债权数额
    ------
    Empty percentage is 0%(0 out of 6893).
    Other values are formatted with end '万元' or pure numbers, but there's some have blank between number and unit,
    we just drop the unit and clear the blanks.

    -----------------------------
    保证担保的范围
    ------
    Empty percentage is 91.47%(6305 out of 6893). We need to drop it.

    -----------------------------
    保证的期间
    ------
    Empty percentage is 0.0435%(3 out of 6893). We just make them the same with '企业选择不公示'.
    Other values mainly '期限'(6348 out of 6893), and we merge '期间','期限','限期' into one('期限'), also there's a few
    listed as time periods, we merge them into '期限' too. The other value is '未约定'.

    -----------------------------
    保证的方式
    ------
    Empty percentage is 0%(0 out of 6893).
    There are 6 values: ['0', '6', '一般保证', '企业选择不公示', '未约定', '连带保证'], cause '0','6','未约定' counts too
    small(59,1,38 separately), we merge them into 'Others'.

    -----------------------------
    主债权种类
    ------
    Empty percentage is 0%(0 out of 6893).
    There are 3 values: ['企业选择不公示', '其他', '合同'].

    -----------------------------
    履行债务的期限
    ------
    Empty percentage is 0.0145%(1 out of 6893).
    Mainly time periods, but the format is not uniformed, some are like '2018年03月24日-2020年11月24日',
    some '2018年03月24日-', some '2017年8月7日-2018年8月6日', some '2015-01-07至2016-01-07', some '2014-04-04~2016-04-04',
    some '-2018年09月29日' and 6 '-'s, also some are '期限' or '企业选择不公示'. We first format all the time periods into
    '2014/4/4~2016/4/4' so we can handle it properly later.

    -----------------------------
    年报年份
    ------
    Empty percentage is 0%(0 out of 6893).
    They are properly formatted.

    -----------------------------
    :return:
    """
    empty_check_list = [
        u'主债权数额'.encode('utf-8'), u'主债权种类'.encode('utf-8'),
        u'保证的方式'.encode('utf-8')
    ]
    dcu.drop_rows_too_many_empty(u'年报-的对外提供保证担保信息.xlsx',
                                 columns=empty_check_list,
                                 thresh=3)
    panaly.list_category_columns_values([u'年报-的对外提供保证担保信息'],
                                        u'年报-的对外提供保证担保信息_empty_handled',
                                        file_url=clean_data_temp_file_url)

    # 保证担保的范围
    dcu.drop_columns(u'年报-的对外提供保证担保信息', [u'保证担保的范围'.encode('utf-8')])

    # 主债权数额
    dcu.drop_unit(u'年报-的对外提供保证担保信息', u'主债权数额'.encode('utf-8'), [u'万元', u' 万元'])

    # 保证的期间
    status_period = [u'期间', u'期限', u'限期']
    status_list = [status_period]
    status_after = [u'期间']
    dcu.merge_status(u'年报-的对外提供保证担保信息', u'保证的期间'.encode('utf-8'), status_list,
                     status_after)

    # 保证的方式
    status_period = ['0', '6', u'未约定']
    status_list = [status_period]
    status_after = [u'Others']
    dcu.merge_status(u'年报-的对外提供保证担保信息', u'保证的方式'.encode('utf-8'), status_list,
                     status_after)

    # 履行债务的期限
    dcu.time_periods_format(u'年报-的对外提供保证担保信息', u'履行债务的期限'.encode('utf-8'))

    return

Exemplo n.º 18

0

Exibir arquivo

def empty_value_handle_assets_info():
    """
    Dirty value handle for table 年报-企业资产状况信息.xlsx.
    First we'll drop rows that empty value is too many.
    ['主营业务收入','净利润','利润总额','所有者权益合计', '纳税总额','营业总收入','负债总额','资产总额']
    Once there are more than 3 empties in these 8 columns we will drop that row.
    Then we check nulls column by column and decide how to process with it.
    Next we should numeric all the value for future process.
    After these are done, it's time to work out features we can use in this table which belongs
        to exploratory data analysis.

    -----------------------------
    资产总额
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11064 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    所有者权益合计
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11235 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    营业总收入
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11344 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    利润总额
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11304 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    主营业务收入
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11529 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    主营业务
    ------
    This value is very complicated with 80% empty(11745 out of 14657). But we think it's somehow important.
    We extract some key words:[u'农', u'土地', u'电器', u'光纤', u'电器', u'化学', u'医疗', u'药', u'信息', u'钢', u'乳',
                u'互联网', u'电机', u'自动化', u'交通', u'汽车', u'投资', u'园区', u'房地产', u'有线', u'日用', u'服饰',
                u'矿', u'开采', u'国有', u'酒', u'银行', u'金融', u'证券', u'航空', u'航天', u'采掘', u'发电', u'工程',
                u'制造'](the sequence is ordered to match the first), Others are into 'Others'. Empty values are
    replaced with 'Unknown'.

    -----------------------------
    净利润
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11292 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    纳税总额
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11292 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    负债总额
    ------
    Empty percentage is 0%(0 out of 14657). But there is 11160 is '企业选择不公示'.
    Other values are well formatted with end '万元', but there's some have blank between number and unit, we just
    drop the unit and clear the blanks.

    -----------------------------
    实际员工数量
    ------
    Empty percentage is 91%(13353 out of 14657). We just drop it.

    -----------------------------
    年报年份
    ------
    Empty percentage is 0%(0 out of 14657).
    There's no need to handle the empty.

    -----------------------------
    :return:
    """
    # EMPTY CHECK
    empty_check_list = [
        u'主营业务收入'.encode('utf-8'), u'净利润'.encode('utf-8'),
        u'利润总额'.encode('utf-8'), u'所有者权益合计'.encode('utf-8'),
        u'纳税总额'.encode('utf-8'), u'营业总收入'.encode('utf-8'),
        u'负债总额'.encode('utf-8'), u'资产总额'.encode('utf-8')
    ]
    dcu.drop_rows_too_many_empty(u'年报-企业资产状况信息.xlsx',
                                 columns=empty_check_list,
                                 thresh=3)

    # LIST OUT VALUES AFTER EMPTY ROWS HANDLED
    panaly.list_category_columns_values([u'年报-企业资产状况信息'],
                                        u'年报-企业资产状况信息_empty_handled',
                                        file_url=clean_data_temp_file_url)

    # COLUMNS HANDLE
    # 资产总额
    dcu.drop_unit(u'年报-企业资产状况信息', u'资产总额'.encode('utf-8'), [u'万元', u' 万元'])

    # 所有者权益合计
    dcu.drop_unit(u'年报-企业资产状况信息', u'所有者权益合计'.encode('utf-8'), [u'万元', u' 万元'])

    # 营业总收入
    dcu.drop_unit(u'年报-企业资产状况信息', u'营业总收入'.encode('utf-8'), [u'万元', u' 万元'])

    # 利润总额
    dcu.drop_unit(u'年报-企业资产状况信息', u'利润总额'.encode('utf-8'), [u'万元', u' 万元'])

    # 主营业务收入
    dcu.drop_unit(u'年报-企业资产状况信息', u'主营业务收入'.encode('utf-8'), [u'万元', u' 万元'])

    # 主营业务
    keywords = [
        u'农', u'土地', u'电器', u'光纤', u'电器', u'化学', u'医疗', u'药', u'信息', u'钢',
        u'乳', u'互联网', u'电机', u'自动化', u'交通', u'汽车', u'投资', u'园区', u'房地产', u'有线',
        u'日用', u'服饰', u'矿', u'开采', u'国有', u'酒', u'银行', u'金融', u'证券', u'航空',
        u'航天', u'采掘', u'发电', u'工程', u'制造'
    ]
    dcu.extract_keyword(u'年报-企业资产状况信息', u'主营业务'.encode('utf-8'), keywords)

    # 净利润
    dcu.drop_unit(u'年报-企业资产状况信息', u'净利润'.encode('utf-8'), [u'万元', u' 万元'])

    # 纳税总额
    dcu.drop_unit(u'年报-企业资产状况信息', u'纳税总额'.encode('utf-8'), [u'万元', u' 万元'])

    # 负债总额
    dcu.drop_unit(u'年报-企业资产状况信息', u'负债总额'.encode('utf-8'), [u'万元', u' 万元'])

    # 实际员工数量
    dcu.drop_columns(u'年报-企业资产状况信息', [u'实际员工数量'.encode('utf-8')])

    # 年报年份

    return

Exemplo n.º 19

0

Exibir arquivo

def data_clean_landing_purchase_fdcdqygdqk():
    """
        Dirty value handle for table 购地-房地产大企业购地情况.xlsx.
        First we'll drop rows that empty value is too many.
        # ['行政区','签订日期','供地总面积（公顷）','约定动工时间', '土地用途','供应方式','最小容积率','最大容积率',
        '成交价款（万元）','约定竣工时间']

        -----------------------------
        行政区
        ------
        drop
        -----------------------------
        签订日期
        ------
        no change
        -----------------------------
        供地总面积（公顷）
        ------
        turn null into 'Unknown'
        -----------------------------
        约定动工时间
        ------
        95% null, turn null into 'Unknown'
        -----------------------------
        土地用途
        ------
        turn into several integers
        -----------------------------
        供应方式
        ------
        95% null, turn null into 'Unknown',turn into several integers
        -----------------------------
        最小容积率
        ------
        98% null, turn null into 'Unknown'
        -----------------------------
        最大容积率
        ------
        98% null, turn null into 'Unknown'
        -----------------------------
        成交价款（万元）
        ------
        turn null into 'Unknown'
        -----------------------------
        约定竣工时间
        ------
        98% null, turn null into 'Unknown'
        -----------------------------
        """
    file_name = u'购地-房地产大企业购地情况'
    dcu.drop_columns(file_name, u'行政区'.encode('utf-8'))  # 删除行政区
    dcu.drop_columns(file_name, u'供应方式'.encode('utf-8'))
    dcu.drop_columns(file_name, u'成交价款（万元）'.encode('utf-8'))
    dcu.drop_columns(file_name, u'最小容积率'.encode('utf-8'))
    dcu.drop_columns(file_name, u'最大容积率'.encode('utf-8'))
    dcu.drop_columns(file_name, u'约定动工时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'约定竣工时间'.encode('utf-8'))

    # 日期：需要去掉小数部分
    time_rearranged(file_name, u'签订日期'.encode('utf-8'), i = 0)
    time_split(file_name, 'time0', i = 0)

    land_usage(file_name, u'土地用途'.encode('utf-8'))  # 土地用途分类

    dcu.merge_status(file_name, u'供地总面积（公顷）'.encode('utf-8'), [], [],empty_mask='0')
    return

Exemplo n.º 20

0

Exibir arquivo

def data_clean_finance_zcfzb():
    """
        Dirty value handle for table 上市信息财务信息资产负债表.xlsx.
        First we'll drop rows that empty value is too many.
       ['企业总评分','标题','日期','资产:货币资金(元)','资产:应收账款(元)','资产:其它应收款(元)','资产:存货(元)','资产:流动资产合计(元)','资产:长期股权投资(元)','资产:累计折旧(元)','资产:固定资产(元)','资产:无形资产(元)','资产:资产总计(元)','负债:应付账款(元)','负债:预收账款(元)','负债:存货跌价准备(元)','负债:流动负债合计(元)','负债:长期负债合计(元)','负债:负债合计(元)','权益:实收资本(或股本)(元)','权益:资本公积金(元)','权益:盈余公积金(元)','权益:股东权益合计(元)','流动比率']
    
    
        -----------------------------
        标题
        ------
        drop this column
        -----------------------------
        日期
        ------
        no change
        -----------------------------
        资产:货币资金(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:应收账款(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:其它应收款(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:存货(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:流动资产合计(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:长期股权投资(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:累计折旧(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        资产:固定资产(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:无形资产(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        资产:资产总计(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        负债:应付账款(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        负债:预收账款(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        负债:存货跌价准备(元)
        ------
        turn '--' into 'NA'
        turn null into 'NA'
        if end with u'万'
            drop u'万'
            *10^4
        if end with u'亿'
            drop u'亿'
            *10^8
    
        -----------------------------
        负债:流动负债合计(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        负债:长期负债合计(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        负债:负债合计(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        权益:实收资本(或股本)(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        权益:资本公积金(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        权益:盈余公积金(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        权益:股东权益合计(元)
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        if end with u'亿' or u'亿'
            ='NA'
        -----------------------------
        流动比率
        ------
        turn null into 'NA'
        turn '--' into 'NA'
        -----------------------------
    """
    dcu.drop_columns(u'上市信息财务信息资产负债表', u'标题'.encode('utf-8'))
    dcu.adjust_time(u'上市信息财务信息资产负债表', u'日期'.encode('utf-8'))

    status_normal = [u'--']  # 搜索满足这个条件的
    status_list = [status_normal]
    status_after = ['Unknown']  # 改成这个

    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:货币资金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:应收账款(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:其它应收款(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:存货(元)', status_list, status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:流动资产合计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:长期股权投资(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:累计折旧(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:固定资产(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:无形资产(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'资产:资产总计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'负债:应付账款(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'负债:预收账款(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'负债:存货跌价准备(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'负债:流动负债合计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'负债:长期负债合计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'负债:负债合计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'权益:实收资本(或股本)(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'权益:资本公积金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'权益:盈余公积金(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'权益:股东权益合计(元)', status_list,
                     status_after)
    dcu.merge_status(u'上市信息财务信息资产负债表', u'流动比率', status_list, status_after)

    dcu.change_number(u'上市信息财务信息资产负债表', u'资产:累计折旧(元)')
    dcu.change_number(u'上市信息财务信息资产负债表', u'负债:存货跌价准备(元)')

    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:货币资金(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:应收账款(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:其它应收款(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:存货(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:流动资产合计(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:长期股权投资(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:固定资产(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:无形资产(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'资产:资产总计(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'负债:应付账款(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'负债:预收账款(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'负债:流动负债合计(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'负债:长期负债合计(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'负债:负债合计(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'权益:实收资本(或股本)(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'权益:资本公积金(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'权益:盈余公积金(元)')
    dcu.merge_number_with_c(u'上市信息财务信息资产负债表', u'权益:股东权益合计(元)')

    return

Exemplo n.º 21

0

Exibir arquivo

def empty_value_handle_share_holder_info():
    """
        Dirty value handle for table 年报-股东（发起人）及出资信息_rearranged.xlsx.
    First we'll drop rows that empty value is too many.
    ['实缴出资额（万元）','实缴出资方式','实缴出资日期','认缴出资方式', '认缴出资日期','认缴出资额（万元）']
    Once there are more than 3 empties in these 8 columns we will drop that row.
    Then we check nulls column by column and decide how to process with it.
    Next we should numeric all the value for future process.
    After these are done, it's time to work out features we can use in this table which belongs
        to exploratory data analysis.

    -----------------------------
    股东类型
    ------
    Empty percentage is 95.8587%(76547 out of 79854). We need to drop it.

    -----------------------------
    股东所占比例
    ------
    Empty percentage is 98.7815263%(78881 out of 79854). We need to drop it.

    -----------------------------
    认缴出资方式
    ------
    Empty percentage is 2.3418%(1870 out of 79854). We replace them with -1.
    It's too complicate, we just count the item values here(may named as '认缴出资方式种类数'). So we just separate them
    with [',', '、'], to do this, we should drop the ',' or '、' at the end first.

    -----------------------------
    认缴出资额（万元）
    ------
    Empty percentage is 0.0288%(23 out of 79854). We just replace them with -1.
    We need to drop the unit ['万', '万元', '万元人民币', '万人民币'], and update ['万美元'] with the number multiplied
    by 6.7.

    -----------------------------
    认缴出资日期
    ------
    Empty percentage is 1.7344%(1385 out of 79854). We replace them by '1000-01-01'
    They are all formatted with format yyyy-mm-dd.
    But there are some are greater than 2019-03-01, we think they are invalid, so replace them as the same as empty.

    -----------------------------
    实缴出资方式
    ------
    Empty percentage is 5.9484%(4750 out of 79854). We replace them with -1.
    It's too complicate, we just count the item values here(may named as '认缴出资方式种类数'). So we just separate them
    with [',', '、', '，'], to do this, we should drop the ',' or '、' or '，' at the end first.

    -----------------------------
    实缴出资额（万元）
    ------
    Empty percentage is 3.2284%(2578 out of 79854). We just replace them with -1.
    We need to drop the unit ['万', '万元', '万元人民币', '万人民币'], and update ['万美元'] with the number multiplied
    by 6.7.

    -----------------------------
    实缴出资日期
    ------
    Empty percentage is 5.2558%(4197 out of 79854). We replace them by '1000-01-01'
    They are all formatted with format yyyy-mm-dd.
    But there are some are greater than 2019-03-01, we think they are invalid, so replace them as the same as empty.

    -----------------------------
    年报年份
    ------
    Empty percentage is 0.05009%(40 out of 79854). We replace them by '1000'

    -----------------------------
    :return:
    """
    empty_check_list = [
        u'实缴出资方式'.encode('utf-8'), u'实缴出资日期'.encode('utf-8'),
        u'实缴出资额（万元）'.encode('utf-8'), u'认缴出资方式'.encode('utf-8'),
        u'认缴出资日期'.encode('utf-8'), u'认缴出资额（万元）'.encode('utf-8')
    ]
    dcu.drop_rows_too_many_empty(u'年报-股东（发起人）及出资信息_rearranged.xlsx',
                                 columns=empty_check_list,
                                 thresh=2)
    panaly.list_category_columns_values(
        [u'年报-股东（发起人）及出资信息_rearranged'],
        u'年报-股东（发起人）及出资信息_rearranged_empty_handled',
        file_url=clean_data_temp_file_url)

    dcu.drop_columns(u'年报-股东（发起人）及出资信息_rearranged',
                     [u'股东类型'.encode('utf-8'), u'股东所占比例'.encode('utf-8')])

    df = file_utils.read_file_to_df(clean_data_temp_file_url,
                                    u'年报-股东（发起人）及出资信息_rearranged')
    values = {
        u'认缴出资方式'.encode('utf-8'): -1,
        u'实缴出资方式'.encode('utf-8'): -1,
        u'认缴出资日期'.encode('utf-8'): '1000-01-01',
        u'实缴出资日期'.encode('utf-8'): '1000-01-01',
        u'认缴出资额（万元）'.encode('utf-8'): -1,
        u'实缴出资额（万元）'.encode('utf-8'): -1,
        u'年报年份'.encode('utf-8'): '1000'
    }
    df = df.fillna(values)
    file_utils.write_file(df, clean_data_temp_file_url,
                          u'年报-股东（发起人）及出资信息_rearranged')

    # 认缴出资方式
    # 实缴出资方式
    splits = [',', u'、', u'，']
    dcu.drop_unit(u'年报-股东（发起人）及出资信息_rearranged',
                  u'认缴出资方式'.encode('utf-8'),
                  splits,
                  empty_mask=-1)
    dcu.drop_unit(u'年报-股东（发起人）及出资信息_rearranged',
                  u'实缴出资方式'.encode('utf-8'),
                  splits,
                  empty_mask=-1)

    dcu.count_split(u'年报-股东（发起人）及出资信息_rearranged',
                    u'认缴出资方式'.encode('utf-8'),
                    splits,
                    empty_mask=-1)
    dcu.count_split(u'年报-股东（发起人）及出资信息_rearranged',
                    u'实缴出资方式'.encode('utf-8'),
                    splits,
                    empty_mask=-1)

    # 认缴出资额（万元）
    # 实缴出资额（万元）
    dcu.drop_unit_with_transfer(u'年报-股东（发起人）及出资信息_rearranged',
                                u'认缴出资额（万元）'.encode('utf-8'),
                                [u'万', u'万元', u'万元人民币', u'万人民币'], {
                                    u'万美元': 6.7,
                                    u'人民币': 0.0001
                                },
                                empty_mask=-1)
    dcu.drop_unit_with_transfer(u'年报-股东（发起人）及出资信息_rearranged',
                                u'实缴出资额（万元）'.encode('utf-8'),
                                [u'万', u'万元', u'万元人民币', u'万人民币'], {
                                    u'万美元': 6.7,
                                    u'人民币': 0.0001
                                },
                                empty_mask=-1)

    return

Exemplo n.º 22

0

Exibir arquivo

def data_clean_landing_purchase_fdcddkcrqk():
    """
                Dirty value handle for table 购地-房地产大地块出让情况.xlsx.
        First we'll drop rows that empty value is too many.
        ['签订日期','供地总面积','约定动工时间', '土地用途','供应方式','容积率下限','容积率上限',
        '成交价款（万元）','约定竣工时间']

        -----------------------------
        签订日期
        ------
        turn into integer
        -----------------------------
        供地总面积
        ------
        turn null into 'Unknown'
        -----------------------------
        约定动工时间
        ------
        97% null, turn null into 'Unknown'
        -----------------------------
        土地用途
        ------
        turn into several integers
        -----------------------------
        供应方式
        ------
        97% null, turn null into 'Unknown',turn into several integers
        -----------------------------
        容积率下限
        ------
        98% null, turn null into 'Unknown'
        -----------------------------
        容积率上限
        ------
        98% null, turn null into 'Unknown'
        -----------------------------
        成交价款（万元）
        ------
        86% null, turn null into 'Unknown'
        -----------------------------
        约定竣工时间
        ------
        99% null, turn null into 'Unknown'
        -----------------------------
        """

    file_name = u'购地-房地产大地块出让情况'
    dcu.drop_columns(file_name, u'约定动工时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'约定竣工时间'.encode('utf-8'))
    dcu.drop_columns(file_name, u'供应方式'.encode('utf-8'))
    dcu.drop_columns(file_name, u'成交价款（万元）'.encode('utf-8'))
    dcu.drop_columns(file_name, u'容积率上限'.encode('utf-8'))
    dcu.drop_columns(file_name, u'容积率下限'.encode('utf-8'))

    # 日期：需要去掉小数部分
    time_rearranged(file_name, u'签订日期'.encode('utf-8'), i=0)
    time_split(file_name, 'time0', i=0)

    land_usage(file_name, u'土地用途'.encode('utf-8'))

    dcu.merge_status(file_name, u'供地总面积'.encode('utf-8'), [], [],empty_mask= 0)
    return