def empty_value_handle_patent():
    """
    Clean the two date columns of table 专利 (patents) and list the result.

    Steps, executed in this order:
        1. 授权公告日: ``dcu.drop_unit`` with the tokens
           u'同一申请的已公布的文献号' and '-'.
        2. 申请日: ``dcu.drop_prefix_unit`` with the prefix u'公告日:'.
        3. 申请日: ``dcu.drop_unit`` with the token '-'.
        4. ``panaly.list_category_columns_values`` writes the category values
           of the table under the name u'专利_empty_handled'.

    Every cleaning call passes empty_mask='1000-01-01' (presumably the
    placeholder written for empty dates -- confirm in ``dcu``).
    :return: None
    """
    table_name = u'专利'
    grant_date_column = u'授权公告日'.encode('utf-8')
    apply_date_column = u'申请日'.encode('utf-8')
    date_placeholder = '1000-01-01'

    dcu.drop_unit(table_name, grant_date_column,
                  [u'同一申请的已公布的文献号', '-'],
                  empty_mask=date_placeholder)
    dcu.drop_prefix_unit(table_name, apply_date_column, [u'公告日:'],
                         empty_mask=date_placeholder)
    dcu.drop_unit(table_name, apply_date_column, ['-'],
                  empty_mask=date_placeholder)

    panaly.list_category_columns_values([table_name], u'专利_empty_handled',
                                        file_url=clean_data_temp_file_url)
def clean_bond():
    """
    Clean table 债券信息 (bond information) in the temp clean-data directory.

    Steps, executed in this order:
        1. Apply an empty-value mask to six columns through
           ``dcu.merge_status`` (both status lists are empty, so only the
           ``empty_mask`` argument carries information):
           债券信用评级 / 主体信用评级 / 债券品种 / 付息方式 -> 'Unknown',
           付息日期 -> '00-00', 兑付日期 -> '0000-00-00'.
        2. 债券期限: ``dcu.drop_unit`` with the unit u'年', empty_mask=-1.
        3. Fill remaining empties of 纳税人资格 and 票面利率(%) with
           'unknown' so the columns can be indexed later. This is done in a
           single read / fillna / write round trip of the workbook.
        4. Drop the columns 币种, 流通场所 and 实际发行总额(亿元).
        5. Run the sibling helpers ``ranking_of_bond``, ``kind_of_bond``,
           ``ranking_of_co``, ``interest_pay`` and ``time_rearranged`` on
           their respective columns.

    :return: None
    """
    file_name = u'债券信息'

    # (column, empty mask) pairs; order matches the original call sequence.
    empty_masks = [
        (u'债券信用评级', 'Unknown'),
        (u'付息日期', '00-00'),
        (u'兑付日期', '0000-00-00'),
        (u'主体信用评级', 'Unknown'),  # empty values become 'Unknown'
        (u'债券品种', 'Unknown'),
        (u'付息方式', 'Unknown'),
    ]
    for column, mask in empty_masks:
        dcu.merge_status(file_name, column.encode('utf-8'), [], [],
                         empty_mask=mask)

    dcu.drop_unit(file_name, u'债券期限'.encode('utf-8'), [u'年'],
                  empty_mask=-1)

    # Fill empties so the columns can be indexed. Both columns are filled in
    # one pass: previously the workbook was read, filled and rewritten twice
    # in a row, once per column, which only repeated the Excel I/O.
    # NOTE(review): 纳税人资格 (taxpayer qualification) looks unrelated to the
    # bond table; it is kept because DataFrame.fillna skips dict keys that
    # are not columns -- confirm whether it can be removed.
    bond_df = fu.read_file_to_df(clean_data_temp_file_url, file_name,
                                 sheet_name='Sheet')
    bond_df = bond_df.fillna({
        u'纳税人资格'.encode('utf-8'): 'unknown',
        u'票面利率(%)'.encode('utf-8'): 'unknown',
    })
    fu.write_file(bond_df, clean_data_temp_file_url, file_name, ext='.xlsx',
                  sheet_name='Sheet', index=False)

    for column in (u'币种', u'流通场所', u'实际发行总额(亿元)'):
        dcu.drop_columns(file_name, column.encode('utf-8'))

    # Sibling helpers defined elsewhere in this file.
    ranking_of_bond(file_name, u'债券信用评级'.encode('utf-8'))
    kind_of_bond(file_name, u'债券品种'.encode('utf-8'))
    ranking_of_co(file_name, u'主体信用评级'.encode('utf-8'))
    interest_pay(file_name, u'付息方式'.encode('utf-8'))
    time_rearranged(file_name, u'发行日期'.encode('utf-8'), i=0)
    time_rearranged(file_name, u'兑付日期'.encode('utf-8'), i=1)
def data_clean_finance_ylnlzb():
    """
    Dirty value handle for table 上市信息财务信息盈利能力指标.xlsx
    (listed-company finance: profitability indicators).

    Columns of the table:
    ['企业总评分','标题','日期','加权净资产收益率(%)','摊薄净资产收益率(%)',
     '摊薄总资产收益率(%)','毛利率(%)','净利率(%)','实际税率(%)']

    -----------------------------
    标题
    ------
    dropped
    -----------------------------
    日期
    ------
    passed through ``dcu.adjust_time``
    -----------------------------
    加权净资产收益率(%), 摊薄净资产收益率(%), 摊薄总资产收益率(%)
    ------
    '--%' and empty values become 'Unknown', then the '%' unit is dropped
    -----------------------------
    毛利率(%), 净利率(%), 实际税率(%)
    ------
    same as above; additionally values > 100 are marked through
    ``dcu.mark_invalid_num_data`` with error_mask='-65535'
    -----------------------------

    All column names are passed as UTF-8 encoded byte strings, like in the
    other cleaning functions of this file.
    :return: None
    """
    table_name = u'上市信息财务信息盈利能力指标'

    dcu.drop_columns(table_name, u'标题'.encode('utf-8'))
    dcu.adjust_time(table_name, u'日期'.encode('utf-8'))

    return_rate_columns = [
        u'加权净资产收益率(%)'.encode('utf-8'),
        u'摊薄净资产收益率(%)'.encode('utf-8'),
        u'摊薄总资产收益率(%)'.encode('utf-8'),
    ]
    # Ratios for which a value above 100 is considered unreasonable.
    bounded_columns = [
        u'毛利率(%)'.encode('utf-8'),
        u'净利率(%)'.encode('utf-8'),
        u'实际税率(%)'.encode('utf-8'),
    ]
    percent_columns = return_rate_columns + bounded_columns

    status_normal = [u'--%']      # values that are searched for ...
    status_list = [status_normal]
    status_after = ['Unknown']    # ... and what they are replaced with

    for column in percent_columns:
        dcu.merge_status(table_name, column, status_list, status_after,
                         empty_mask='Unknown')

    unit_strs = [u'%']
    for column in percent_columns:
        dcu.drop_unit(table_name, column, unit_strs)

    # Mark unreasonable data.
    for column in bounded_columns:
        dcu.mark_invalid_num_data(table_name, column, '>', 100,
                                  error_mask='-65535')
def data_clean_finance_cwfxzb():
    """
    Dirty value handle for table 上市信息财务信息-财务风险指标.xlsx
    (listed-company finance: financial risk indicators).

    Columns of the table:
    ['企业总评分','标题','日期','资产负债率(%)','流动负债/总负债(%)',
     '流动比率','速动比率']

    The original notes say valid data are all doubles (floats).

    -----------------------------
    标题
    ------
    dropped
    -----------------------------
    日期
    ------
    passed through ``dcu.adjust_time``
    -----------------------------
    资产负债率(%), 流动负债/总负债(%)
    ------
    '--', '--%' and empty values become 'Unknown', then the '%' unit is
    dropped
    -----------------------------
    流动比率, 速动比率
    ------
    '--', '--%' and empty values become 'Unknown'
    -----------------------------

    NOTE(review): the original notes mention dropping rows with too many
    empty values first; no such step is performed in this function.
    :return: None
    """
    table_name = u'上市信息财务信息-财务风险指标'

    dcu.drop_columns(table_name, u'标题'.encode('utf-8'))
    dcu.adjust_time(table_name, u'日期'.encode('utf-8'))

    percent_columns = [
        u'资产负债率(%)'.encode('utf-8'),
        u'流动负债/总负债(%)'.encode('utf-8'),
    ]
    plain_ratio_columns = [
        u'流动比率'.encode('utf-8'),
        u'速动比率'.encode('utf-8'),
    ]

    placeholders = [[u'--', u'--%']]   # values that are searched for ...
    replacement = ['Unknown']          # ... and what they are replaced with
    for column in percent_columns + plain_ratio_columns:
        dcu.merge_status(table_name, column, placeholders, replacement,
                         empty_mask='Unknown')

    # Remove the percent sign.
    percent_sign = [u'%']
    for column in percent_columns:
        dcu.drop_unit(table_name, column, percent_sign)
def data_clean_finance_cznlzb():
    """
    Dirty value handle for table 上市信息财务信息-成长能力指标.xlsx
    (listed-company finance: growth indicators).

    Columns of the table:
    ['企业总评分','标题','日期','营业总收入(元)','毛利润(元)','归属净利润(元)',
     '扣非净利润(元)','营业总收入同比增长(元)','归属净利润同比增长(元)',
     '扣非净利润同比增长(元)','营业总收入滚动环比增长(元)',
     '归属净利润滚动环比增长(元)','扣非净利润滚动环比增长(元)']
    The last six variables are expressed in percent.

    After these are done, it's time to work out features we can use in this
    table which belongs to exploratory data analysis.

    What is currently executed:
        * ``dcu.change_number`` on 毛利润(元), 归属净利润(元), 扣非净利润(元)
        * ``dcu.drop_unit`` with u'%' on the six growth columns
    The column drop, the date adjustment, the '--' / '--%' status merging
    and the number conversion of 营业总收入(元) are commented out below.

    Cleaning plan from the original notes (not all of it is active, see
    above):
    -----------------------------
    标题
    ------
    drop this column
    -----------------------------
    日期
    ------
    no change
    -----------------------------
    营业总收入(元)
    ------
    turn '--' into 'NA'
    if end with u'万亿' drop u'万亿' *10^12
    if end with u'万' drop u'万' *10^4
    if end with u'亿' drop u'亿' *10^8
    -----------------------------
    毛利润(元), 归属净利润(元), 扣非净利润(元)
    ------
    turn '--' into 'NA'
    if end with u'万' drop u'万' *10^4
    if end with u'亿' drop u'亿' *10^8
    -----------------------------
    营业总收入同比增长(元)(%), 归属净利润同比增长(元)(%),
    扣非净利润同比增长(元)(%), 营业总收入滚动环比增长(元)(%),
    归属净利润滚动环比增长(元)(%), 扣非净利润滚动环比增长(元)(%)
    ------
    turn '--%' into 'NA'
    -----------------------------

    Example helper call from the original notes:
    dcu.change_number(u'temp',u'a',empty_mask='Unknown')
    """
    # --- Disabled steps (kept for reference) --------------------------------
    # dcu.drop_columns(u'上市信息财务信息-成长能力指标', u'标题'.encode('utf-8'))
    # dcu.adjust_time(u'上市信息财务信息-成长能力指标', u'日期'.encode('utf-8'))
    #
    # status_normal = [u'--', u'--%']  # values that are searched for
    # status_list = [status_normal]
    # status_after = ['Unknown']  # what they are replaced with
    #
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'营业总收入(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'毛利润(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'归属净利润(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'扣非净利润(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'营业总收入同比增长(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'归属净利润同比增长(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'扣非净利润同比增长(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'营业总收入滚动环比增长(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'归属净利润滚动环比增长(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # dcu.merge_status(u'上市信息财务信息-成长能力指标', u'扣非净利润滚动环比增长(元)'.encode('utf-8'), status_list, status_after, empty_mask='Unknown')
    # NOTE(review): the conversion of 营业总收入(元) is disabled while the other
    # three amount columns are converted -- confirm this is intentional.
    # dcu.change_number(u'上市信息财务信息-成长能力指标', u'营业总收入(元)'.encode('utf-8'))

    # --- Active steps -------------------------------------------------------
    # Amount columns; presumably change_number converts the 万 / 亿 suffixed
    # values described in the docstring into plain numbers -- confirm in dcu.
    dcu.change_number(u'上市信息财务信息-成长能力指标', u'毛利润(元)'.encode('utf-8'))
    dcu.change_number(u'上市信息财务信息-成长能力指标', u'归属净利润(元)'.encode('utf-8'))
    dcu.change_number(u'上市信息财务信息-成长能力指标', u'扣非净利润(元)'.encode('utf-8'))

    # Growth columns: remove the percent sign.
    unit_strs = [u'%']
    dcu.drop_unit(u'上市信息财务信息-成长能力指标', u'营业总收入同比增长(元)'.encode('utf-8'), unit_strs)
    dcu.drop_unit(u'上市信息财务信息-成长能力指标', u'归属净利润同比增长(元)'.encode('utf-8'), unit_strs)
    dcu.drop_unit(u'上市信息财务信息-成长能力指标', u'扣非净利润同比增长(元)'.encode('utf-8'), unit_strs)
    dcu.drop_unit(u'上市信息财务信息-成长能力指标', u'营业总收入滚动环比增长(元)'.encode('utf-8'), unit_strs)
    dcu.drop_unit(u'上市信息财务信息-成长能力指标', u'归属净利润滚动环比增长(元)'.encode('utf-8'), unit_strs)
    dcu.drop_unit(u'上市信息财务信息-成长能力指标', u'扣非净利润滚动环比增长(元)'.encode('utf-8'), unit_strs)
    return
def empty_value_handle_share_holder_info():
    """
    Dirty value handle for table 年报-股东(发起人)及出资信息_rearranged.xlsx
    (annual report: shareholder / sponsor contribution information).

    Processing order:
        1. ``dcu.drop_rows_too_many_empty`` over the six contribution columns
           ['实缴出资方式','实缴出资日期','实缴出资额(万元)','认缴出资方式',
            '认缴出资日期','认缴出资额(万元)'] with thresh=2.
           NOTE(review): the original notes spoke of "more than 3 empties in
           these 8 columns"; the code checks 6 columns with thresh=2.
        2. List category values under the name
           '年报-股东(发起人)及出资信息_rearranged_empty_handled'.
        3. Drop the columns 股东类型 and 股东所占比例.
        4. Fill the remaining empties through ``DataFrame.fillna`` (values
           below) and write the table back.
        5. Contribution-method columns: ``dcu.drop_unit`` with the separator
           list, then ``dcu.count_split`` with the same list.
        6. Contribution-amount columns: ``dcu.drop_unit_with_transfer`` with
           the units ['万','万元','万元人民币','万人民币'] and the factors
           {'万美元': 6.7, '人民币': 0.0001}.

    Column notes from the original analysis (79854 rows):
    -----------------------------
    股东类型          95.8587% empty (76547)  -> dropped
    股东所占比例      98.7815263% empty (78881) -> dropped
    认缴出资方式      2.3418% empty (1870)   -> -1; too complicated, so only
                      the number of listed items is counted after trailing
                      separators are removed
    认缴出资额(万元)  0.0288% empty (23)     -> -1; units dropped, '万美元'
                      amounts multiplied by 6.7
    认缴出资日期      1.7344% empty (1385)   -> '1000-01-01'; formatted as
                      yyyy-mm-dd. Dates after 2019-03-01 were considered
                      invalid in the notes (no such check is done here).
    实缴出资方式      5.9484% empty (4750)   -> -1; handled like 认缴出资方式
    实缴出资额(万元)  3.2284% empty (2578)   -> -1; handled like
                      认缴出资额(万元)
    实缴出资日期      5.2558% empty (4197)   -> '1000-01-01'; same remarks as
                      认缴出资日期
    年报年份          0.05009% empty (40)    -> '1000'
    -----------------------------
    :return: None
    """
    workbook_name = u'年报-股东(发起人)及出资信息_rearranged.xlsx'
    table_name = u'年报-股东(发起人)及出资信息_rearranged'
    listing_name = u'年报-股东(发起人)及出资信息_rearranged_empty_handled'

    subscribed_method = u'认缴出资方式'.encode('utf-8')
    subscribed_date = u'认缴出资日期'.encode('utf-8')
    subscribed_amount = u'认缴出资额(万元)'.encode('utf-8')
    paid_method = u'实缴出资方式'.encode('utf-8')
    paid_date = u'实缴出资日期'.encode('utf-8')
    paid_amount = u'实缴出资额(万元)'.encode('utf-8')

    # Drop rows with too many empties among the contribution columns.
    dcu.drop_rows_too_many_empty(
        workbook_name,
        columns=[paid_method, paid_date, paid_amount,
                 subscribed_method, subscribed_date, subscribed_amount],
        thresh=2)

    panaly.list_category_columns_values([table_name], listing_name,
                                        file_url=clean_data_temp_file_url)

    # Columns that are almost entirely empty.
    dcu.drop_columns(table_name, [u'股东类型'.encode('utf-8'),
                                  u'股东所占比例'.encode('utf-8')])

    # Fill what is still empty, column by column.
    fill_values = {
        subscribed_method: -1,
        paid_method: -1,
        subscribed_date: '1000-01-01',
        paid_date: '1000-01-01',
        subscribed_amount: -1,
        paid_amount: -1,
        u'年报年份'.encode('utf-8'): '1000',
    }
    holder_df = file_utils.read_file_to_df(clean_data_temp_file_url,
                                           table_name)
    holder_df = holder_df.fillna(fill_values)
    file_utils.write_file(holder_df, clean_data_temp_file_url, table_name)

    # Contribution methods: strip separators, then count the listed items.
    # NOTE(review): the third separator renders the same as the first one in
    # this source; it may originally have been a full-width comma -- confirm.
    splits = [',', u'、', u',']
    method_columns = (subscribed_method, paid_method)
    for column in method_columns:
        dcu.drop_unit(table_name, column, splits, empty_mask=-1)
    for column in method_columns:
        dcu.count_split(table_name, column, splits, empty_mask=-1)

    # Contribution amounts: drop the units and convert foreign currency.
    for column in (subscribed_amount, paid_amount):
        dcu.drop_unit_with_transfer(
            table_name, column,
            [u'万', u'万元', u'万元人民币', u'万人民币'],
            {u'万美元': 6.7, u'人民币': 0.0001},
            empty_mask=-1)
def empty_value_handle_basic_info():
    """
    Empty / dirty value handle for table 年报-企业基本信息
    (annual report: basic company information).

    Processing order:
        1. ``dcu.drop_rows_too_many_empty`` over
           ['企业经营状态','从业人数','是否有网站或网点',
            '企业是否有投资信息或购买其他公司股权',
            '有限责任公司本年度是否发生股东股权转','是否提供对外担保']
           with thresh=3 (the notes: rows with more than 3 empties in these
           6 columns are dropped).
        2. List category values under '年报-企业基本信息_empty_handled'.
        3. Column handling as described below.
    Numeric encoding and feature extraction are left to later steps.

    Column notes from the original analysis (14862 rows):
    -----------------------------
    注册资本
    ------
    88% empty -> column dropped.
    -----------------------------
    企业经营状态
    ------
    1 empty value. Eight raw statuses: ['停业','其他','存续','开业',
    '开业/正常经营','歇业','正常开业','清算'].
    ['开业','开业/正常经营','正常开业'] are merged into '正常经营',
    ['停业','其他','存续','歇业','清算'] into '非正常经营'; 'Unknown' is the
    third entry of the replacement list (intended for the empty value). The
    notes plan the later mapping {'正常经营':0,'非正常经营':1,'Unknown':-1}.
    -----------------------------
    从业人数
    ------
    No empty value. Some values end with '人', some are pure numbers; 11623
    are '企业选择不公示' and a few are a bare '人'. The units '人' / ' 人'
    are dropped with empty_mask=-1 (-1 is a status, not a head count).
    NOTE(review): the notes say '企业选择不公示' should become 0; that
    replacement is not visible in this function.
    -----------------------------
    是否有网站或网点
    ------
    No empty value. ['否','无'] -> 'No', ['是','有'] -> 'Yes'.
    -----------------------------
    企业是否有投资信息或购买其他公司股权
    ------
    0.02% empty (3). Same Yes/No merge; the notes map empties to 'Unknown'.
    -----------------------------
    有限责任公司本年度是否发生股东股权转
    ------
    0.013% empty (2). Same Yes/No merge; the notes map empties to 'Unknown'.
    -----------------------------
    是否提供对外担保
    ------
    0.075% empty (11). Raw values ['否','是']; same Yes/No merge; the notes
    map empties to 'Unknown'.
    -----------------------------
    发布日期, 年报年份
    ------
    No empty value and well formatted -> untouched.
    -----------------------------
    :return: None
    """
    table_name = u'年报-企业基本信息'

    # EMPTY CHECK
    checked_columns = [
        u'企业经营状态'.encode('utf-8'),
        u'从业人数'.encode('utf-8'),
        u'是否有网站或网点'.encode('utf-8'),
        u'企业是否有投资信息或购买其他公司股权'.encode('utf-8'),
        u'有限责任公司本年度是否发生股东股权转'.encode('utf-8'),
        u'是否提供对外担保'.encode('utf-8'),
    ]
    dcu.drop_rows_too_many_empty(u'年报-企业基本信息.xlsx',
                                 columns=checked_columns, thresh=3)

    # LIST OUT VALUES AFTER EMPTY ROWS HANDLED
    panaly.list_category_columns_values([table_name],
                                        u'年报-企业基本信息_empty_handled',
                                        file_url=clean_data_temp_file_url)

    # COLUMNS HANDLE
    # Registered capital: dropped.
    dcu.drop_columns(table_name, [u'注册资本'.encode('utf-8')])

    # Operating status: collapse the raw statuses into two groups.
    operating = [u'开业', u'开业/正常经营', u'正常开业']
    not_operating = [u'停业', u'其他', u'存续', u'歇业', u'清算']
    dcu.merge_status(table_name, u'企业经营状态'.encode('utf-8'),
                     [operating, not_operating],
                     [u'正常经营', u'非正常经营', u'Unknown'])

    # Number of employees: strip the unit.
    dcu.drop_unit(table_name, u'从业人数'.encode('utf-8'), [u'人', u' 人'],
                  empty_mask=-1)

    # Yes/No style columns share the same merge definition.
    yes_no_groups = [[u'否', u'无'], [u'是', u'有']]
    yes_no_labels = ['No', 'Yes']
    yes_no_columns = (
        u'是否有网站或网点',
        u'企业是否有投资信息或购买其他公司股权',
        u'有限责任公司本年度是否发生股东股权转',
        u'是否提供对外担保',
    )
    for column in yes_no_columns:
        dcu.merge_status(table_name, column.encode('utf-8'),
                         yes_no_groups, yes_no_labels)

    # 发布日期 and 年报年份 need no handling.
def empty_value_handle_social_security_info():
    """
    Dirty value handle for table 年报-社保信息.xlsx
    (annual report: social security information).

    Processing order:
        1. ``dcu.drop_rows_too_many_empty`` with thresh=3 over the 13 columns
           listed in the code.
           NOTE(review): the original notes listed 9 columns ("more than 3
           empties in these 9 columns"); the code checks 13.
        2. List category values under '年报-社保信息_empty_handled'.
        3. Drop 单位参加工伤保险缴费基数 (96.9631% empty, 5364 of 5532).
        4. Head-count columns: merge '企业选择不公示' / '选择不公示' into
           'NP', then ``dcu.drop_unit`` with ['人', ' 人'], empty_mask=-1.
        5. Amount columns: the same 'NP' merge, then
           ``dcu.drop_unit_remove_minus`` with ['万元', ' 万元'],
           empty_mask=-1.
    Numeric encoding and feature extraction are left to later steps.

    Notes from the original analysis (5532 rows). Empty values are marked as
    -1. Values carry the unit at the end, sometimes separated by a blank;
    the unit is dropped and the blank cleared. Amount cells holding only
    '万元' are regarded as missing (-1). The arrears columns contain a
    negative value whose minus sign is removed.

    Head-count columns (unit '人'):
        column                          empty
        城镇职工基本养老保险人数        0.1265% (7)
        失业保险人数                    0.0904% (5)
        职工基本医疗保险人数            0.1085% (6)
        工伤保险人数                    0.0904% (5)
        生育保险人数                    0.1085% (6)

    Amount columns (unit '万元'); "NP" gives the counts of
    '企业选择不公示' + '选择不公示', "bare" the count of '万元'-only cells:
        column                                    empty         NP       bare
        单位参加城镇职工基本养老保险缴费基数      4.3745% (242) 592+325  235
        单位参加失业保险缴费基数                  0.0904% (5)   592+325  235
        单位参加职工基本医疗保险缴费基数          0.0904% (5)   592+325  235
        单位参加生育保险缴费基数                  0.0723% (4)   593+325  235
        参加城镇职工基本养老保险本期实际缴费金额  0.0904% (5)   590+325  239
        参加失业保险本期实际缴费金额              0.0904% (5)   590+325  239
        参加职工基本医疗保险本期实际缴费金额      0.0904% (5)   590+325  235
        参加工伤保险本期实际缴费金额              0.0904% (5)   590+325  313
        参加生育保险本期实际缴费金额              0.0904% (5)   590+325  235
        单位参加城镇职工基本养老保险累计欠缴金额  0% (0)        596+324  240
        单位参加失业保险累计欠缴金额              0% (0)        596+324  235
        单位参加职工基本医疗保险累计欠缴金额      0% (0)        596+324  235
        单位参加工伤保险累计欠缴金额              0% (0)        600+324  235
        单位参加生育保险累计欠缴金额              0% (0)        596+324  235

    年报年份 has no empty value and is well formatted.
    :return: None
    """
    table_name = u'年报-社保信息'

    # Columns inspected when dropping rows with too many empties.
    checked_columns = [name.encode('utf-8') for name in (
        u'单位参加城镇职工基本养老保险累计欠缴金额',
        u'单位参加城镇职工基本养老保险缴费基数',
        u'单位参加失业保险累计欠缴金额',
        u'单位参加失业保险缴费基数',
        u'单位参加工伤保险累计欠缴金额',
        u'单位参加工伤保险缴费基数',
        u'单位参加生育保险缴费基数',
        u'城镇职工基本养老保险人数',
        u'失业保险人数',
        u'参加失业保险本期实际缴费金额',
        u'参加工伤保险本期实际缴费金额',
        u'参加城镇职工基本养老保险本期实际缴费金额',
        u'工伤保险人数',
    )]
    dcu.drop_rows_too_many_empty(u'年报-社保信息.xlsx',
                                 columns=checked_columns, thresh=3)

    panaly.list_category_columns_values([table_name],
                                        u'年报-社保信息_empty_handled',
                                        file_url=clean_data_temp_file_url)

    dcu.drop_columns(table_name, [u'单位参加工伤保险缴费基数'.encode('utf-8')])

    # Both "not published" spellings collapse into the single status 'NP'.
    not_published = [[u'企业选择不公示', u'选择不公示']]
    np_label = ['NP']

    headcount_columns = [name.encode('utf-8') for name in (
        u'城镇职工基本养老保险人数',
        u'失业保险人数',
        u'职工基本医疗保险人数',
        u'工伤保险人数',
        u'生育保险人数',
    )]
    amount_columns = [name.encode('utf-8') for name in (
        u'单位参加城镇职工基本养老保险缴费基数',
        u'单位参加失业保险缴费基数',
        u'单位参加职工基本医疗保险缴费基数',
        u'单位参加生育保险缴费基数',
        u'参加城镇职工基本养老保险本期实际缴费金额',
        u'参加失业保险本期实际缴费金额',
        u'参加职工基本医疗保险本期实际缴费金额',
        u'参加工伤保险本期实际缴费金额',
        u'参加生育保险本期实际缴费金额',
        u'单位参加城镇职工基本养老保险累计欠缴金额',
        u'单位参加失业保险累计欠缴金额',
        u'单位参加职工基本医疗保险累计欠缴金额',
        u'单位参加工伤保险累计欠缴金额',
        u'单位参加生育保险累计欠缴金额',
    )]

    for headcount_column in headcount_columns:
        dcu.merge_status(table_name, headcount_column,
                         not_published, np_label)
        dcu.drop_unit(table_name, headcount_column, [u'人', u' 人'],
                      empty_mask=-1)

    for amount_column in amount_columns:
        dcu.merge_status(table_name, amount_column, not_published, np_label)
        dcu.drop_unit_remove_minus(table_name, amount_column,
                                   [u'万元', u' 万元'], empty_mask=-1)
def empty_value_handle_out_warrant_info():
    """
    Dirty value handle for table 年报-的对外提供保证担保信息.xlsx
    (annual report: guarantees provided to external parties).

    Processing order:
        1. ``dcu.drop_rows_too_many_empty`` over
           ['主债权数额','主债权种类','保证的方式'] with thresh=3.
        2. List category values under
           '年报-的对外提供保证担保信息_empty_handled'.
        3. Column handling as described below.
    Numeric encoding and feature extraction are left to later steps.

    Column notes from the original analysis (6893 rows):
    -----------------------------
    主债权数额
    ------
    No empty value. Values end with '万元' or are pure numbers, some with a
    blank between number and unit; the units '万元' / ' 万元' are dropped.
    -----------------------------
    保证担保的范围
    ------
    91.47% empty (6305) -> column dropped.
    -----------------------------
    保证的期间
    ------
    0.0435% empty (3); the notes treat them like '企业选择不公示'. The most
    frequent value is '期限' (6348). '期间', '期限' and '限期' are merged into
    one status; the code uses '期间' as the merged label.
    NOTE(review): the original notes named '期限' as the merge target and
    also planned to merge the few time-period values; only the three-way
    merge into '期间' is done here. The other value is '未约定'.
    -----------------------------
    保证的方式
    ------
    No empty value. Six values: ['0', '6', '一般保证', '企业选择不公示',
    '未约定', '连带保证']. '0', '6' and '未约定' are rare (59, 1 and 38 rows)
    and are merged into 'Others'.
    -----------------------------
    主债权种类
    ------
    No empty value. Three values: ['企业选择不公示', '其他', '合同'].
    Untouched.
    -----------------------------
    履行债务的期限
    ------
    0.0145% empty (1). Mostly time periods in mixed formats, e.g.
    '2018年03月24日-2020年11月24日', '2018年03月24日-',
    '2017年8月7日-2018年8月6日', '2015-01-07至2016-01-07',
    '2014-04-04~2016-04-04', '-2018年09月29日', six bare '-', plus some
    '期限' or '企业选择不公示'. The column is passed to
    ``dcu.time_periods_format``; the notes give '2014/4/4~2016/4/4' as the
    target format.
    -----------------------------
    年报年份
    ------
    No empty value, properly formatted. Untouched.
    -----------------------------
    :return: None
    """
    table_name = u'年报-的对外提供保证担保信息'

    checked_columns = [
        u'主债权数额'.encode('utf-8'),
        u'主债权种类'.encode('utf-8'),
        u'保证的方式'.encode('utf-8'),
    ]
    dcu.drop_rows_too_many_empty(u'年报-的对外提供保证担保信息.xlsx',
                                 columns=checked_columns, thresh=3)

    panaly.list_category_columns_values(
        [table_name], u'年报-的对外提供保证担保信息_empty_handled',
        file_url=clean_data_temp_file_url)

    # Scope of the guarantee: dropped.
    dcu.drop_columns(table_name, [u'保证担保的范围'.encode('utf-8')])

    # Amount of the principal claim: strip the unit.
    dcu.drop_unit(table_name, u'主债权数额'.encode('utf-8'),
                  [u'万元', u' 万元'])

    # Guarantee period: three spellings collapse into one status.
    period_synonyms = [u'期间', u'期限', u'限期']
    dcu.merge_status(table_name, u'保证的期间'.encode('utf-8'),
                     [period_synonyms], [u'期间'])

    # Guarantee type: rare values collapse into 'Others'.
    rare_guarantee_types = ['0', '6', u'未约定']
    dcu.merge_status(table_name, u'保证的方式'.encode('utf-8'),
                     [rare_guarantee_types], [u'Others'])

    # Debt performance period: unify the time-period format.
    dcu.time_periods_format(table_name, u'履行债务的期限'.encode('utf-8'))
def empty_value_handle_assets_info():
    """
    Clean dirty values in table 年报-企业资产状况信息.xlsx
    (annual report - enterprise asset status information).

    Processing order:
      1. Call ``dcu.drop_rows_too_many_empty`` on the eight monetary columns
         ['主营业务收入', '净利润', '利润总额', '所有者权益合计',
          '纳税总额', '营业总收入', '负债总额', '资产总额'] with ``thresh=3``.
         The original notes describe this as "drop a row once more than 3
         of these 8 columns are empty".
         NOTE(review): the exact meaning of ``thresh`` is defined inside
         ``dcu`` and is not visible here - confirm.
      2. Dump the remaining category values through
         ``panaly.list_category_columns_values`` for inspection.
      3. Handle the columns one by one as listed below, so that the values
         can be turned into numbers / features later on.

    Column notes (statistics recorded by the original author, 14657 rows).
    All eight monetary columns have 0% empty values but a large share of
    '企业选择不公示' (enterprise chose not to disclose). The remaining values
    end with '万元', sometimes with a blank before the unit. For each of
    them the suffixes '万元' and ' 万元' are handed to ``dcu.drop_unit``.
    -----------------------------
    资产总额 (total assets)               - 11064 rows '企业选择不公示'
    所有者权益合计 (total owner's equity) - 11235 rows '企业选择不公示'
    营业总收入 (total operating revenue)  - 11344 rows '企业选择不公示'
    利润总额 (total profit)               - 11304 rows '企业选择不公示'
    主营业务收入 (main business revenue)  - 11529 rows '企业选择不公示'
    净利润 (net profit)                   - 11292 rows '企业选择不公示'
    纳税总额 (total tax paid)             - 11292 rows '企业选择不公示'
    负债总额 (total liabilities)          - 11160 rows '企业选择不公示'

    -----------------------------
    主营业务 (main business)
    ------
    About 80% empty (11745 rows) but considered important. A fixed,
    ordered keyword list is passed to ``dcu.extract_keyword``. Per the
    original notes the first matching keyword wins, unmatched values become
    'Others' and empty values become 'Unknown'.
    NOTE(review): that behaviour lives in ``dcu.extract_keyword`` and is
    not visible here - confirm.

    -----------------------------
    实际员工数量 (actual number of employees)
    ------
    91% empty (13353 rows). The column is dropped.

    -----------------------------
    年报年份 (report year)
    ------
    0% empty. Nothing to handle.

    -----------------------------
    :return: None
    """
    table = u'年报-企业资产状况信息'
    workbook = u'年报-企业资产状况信息.xlsx'
    unit_suffixes = (u'万元', u' 万元')

    def strip_unit(column):
        # A fresh list is built for every call, as the original code did.
        dcu.drop_unit(table, column.encode('utf-8'), list(unit_suffixes))

    # Row-level cleanup: these are the columns inspected for emptiness.
    monetary_columns = (
        u'主营业务收入', u'净利润', u'利润总额', u'所有者权益合计',
        u'纳税总额', u'营业总收入', u'负债总额', u'资产总额',
    )
    dcu.drop_rows_too_many_empty(
        workbook,
        columns=[column.encode('utf-8') for column in monetary_columns],
        thresh=3
    )

    # Snapshot of the category values after the row cleanup.
    panaly.list_category_columns_values(
        [table],
        u'年报-企业资产状况信息_empty_handled',
        file_url=clean_data_temp_file_url
    )

    # Column handling. The call order below mirrors the column order of the
    # table: five monetary columns, then the keyword extraction, then the
    # remaining three monetary columns.
    for column in (u'资产总额', u'所有者权益合计', u'营业总收入',
                   u'利润总额', u'主营业务收入'):
        strip_unit(column)

    # Main business: the list order matters according to the original notes.
    # NOTE(review): u'电器' appears twice; it is kept as-is because the
    # effect of removing it depends on dcu.extract_keyword - confirm.
    business_keywords = [
        u'农', u'土地', u'电器', u'光纤', u'电器', u'化学', u'医疗',
        u'药', u'信息', u'钢', u'乳', u'互联网', u'电机', u'自动化',
        u'交通', u'汽车', u'投资', u'园区', u'房地产', u'有线', u'日用',
        u'服饰', u'矿', u'开采', u'国有', u'酒', u'银行', u'金融',
        u'证券', u'航空', u'航天', u'采掘', u'发电', u'工程', u'制造',
    ]
    dcu.extract_keyword(table, u'主营业务'.encode('utf-8'), business_keywords)

    for column in (u'净利润', u'纳税总额', u'负债总额'):
        strip_unit(column)

    # Actual number of employees: almost entirely empty, so drop the column.
    dcu.drop_columns(table, [u'实际员工数量'.encode('utf-8')])