def recognize_instucti(indexcontent):
    """Collect the tables and free text of one index section into one string.

    Every element of ``indexcontent`` is a mapping from a classification key
    to its payload:

    * ``'c'`` -- groups of tables. Each non-string table is cleaned with
      ``remove_space_from_df`` and ``remove_per_from_df``, printed together
      with the detected column and row indexes, and its ``to_string()``
      rendering is appended to the result text.
    * ``'t'`` -- a text string. A text matching ``...单位<colon><unit>`` sets
      the unit; any other text is appended after everything up to and
      including each ``?适用?不适用`` marker has been removed, unless nothing
      is left.

    Entries with any other key, or with an empty payload, are ignored.

    :param indexcontent: iterable of ``{classify: item}`` mappings.
    :return: tuple ``(df, unit, text)`` -- the last cleaned table (``None``
        when no table was seen), the unit (``'元'`` by default) and the
        concatenated text.
    """
    df = None
    instructi = []
    unit = '元'
    # The character class previously listed the ASCII colon twice; the second
    # entry is now the full-width colon (U+FF1A), written as an escape so it
    # survives text normalisation of the source file.
    pattern0 = re.compile('^.*?单位[:\uff1a](.*?)$')

    for content in indexcontent:
        for classify, item in content.items():
            if classify == 'c' and len(item) > 0:
                for tables in item:
                    for table in tables:
                        # Plain strings can sit next to the tables; skip them.
                        if isinstance(table, str):
                            continue
                        df = remove_per_from_df(remove_space_from_df(table))
                        print(df)
                        print('提取的列索引是{}'.format(detect_columns(df)))
                        print('提取的行索引是{}'.format(detect_indexes(df)))
                        instructi.append(df.to_string())
            elif classify == 't' and len(item) > 0:
                # Match once and reuse the result.
                unit_match = pattern0.match(item)
                if unit_match:
                    unit = unit_match.group(1)
                else:
                    ret = re.sub('.*?.适用.不适用', '', item)
                    if ret != '':
                        instructi.append(ret)

    return df, unit, ''.join(instructi)
def recognize(self):
    """Pick out the tables of index section ``02050000`` by their first column.

    Only the first table of the first table group of each ``'c'`` payload is
    inspected. If any cell of its first column starts with
    ``会计师事务所名称`` the space-cleaned table is stored under ``'accout'``;
    otherwise, if any cell starts with ``财务顾问名称`` it is stored under
    ``'sponsor'``.

    :return: dict holding zero, one or both of the keys above. It is empty
        when ``self.indexno`` is not ``'02050000'``.
    """
    found = {}
    if self.indexno not in ['02050000']:
        return found

    for section in self.indexcontent:
        for kind, payload in section.items():
            # Only non-empty table payloads are of interest here.
            if kind != 'c' or len(payload) == 0:
                continue
            leading_table = payload[0][0]
            first_column = leading_table.iloc[:, 0]
            if first_column.str.match("会计师事务所名称").any():
                found['accout'] = remove_space_from_df(leading_table)
            elif first_column.str.match("财务顾问名称").any():
                found['sponsor'] = remove_space_from_df(leading_table)

    return found
def get_dfs(classify, item):
    """Extract several tables from a mixed list of headings and table groups.

    ``item`` is a sequence whose entries are either heading strings or lists
    of tables. For every keyword in ``classify`` the first heading string
    containing that keyword is looked up, and the first table of the entry
    directly after that heading is stored under the heading's full text.

    :param classify: tuple of heading keywords.
    :param item: sequence of ``str`` and ``list`` entries.
    :return: dict of tables cleaned with ``remove_space_from_df`` and
        ``remove_per_from_df``. Key ``'first'`` holds the first table of the
        first non-empty, non-string entry; the remaining keys are the matched
        heading texts.
    :raises NoTupleException: if ``classify`` is not a tuple.
    :raises TypeError: if ``item`` contains an entry that is neither a
        ``str`` nor a ``list``.
    """
    if not isinstance(classify, tuple):
        raise NoTupleException

    # Validate up front so a malformed entry is reported before any work.
    tables = []
    for entry in item:
        if not isinstance(entry, (str, list)):
            raise TypeError(
                'item entries must be str or list, got {}'.format(
                    type(entry).__name__))
        tables.append(entry)

    dfs = {}

    # 'first': first table of the first table group that actually has one.
    # Empty groups are skipped instead of failing on ``group[0]``.
    for entry in tables:
        if isinstance(entry, list) and len(entry) > 0:
            dfs['first'] = remove_per_from_df(remove_space_from_df(entry[0]))
            break

    texts = [entry for entry in tables if isinstance(entry, str)]
    total = len(tables)
    for keyword in classify:
        # Widen the keyword to the first heading that contains it.
        heading = next((text for text in texts if keyword in text), keyword)
        if heading not in tables:
            continue
        following = tables.index(heading) + 1
        if following >= total:
            continue
        candidate = tables[following]
        # An empty string or empty list after the heading holds no table.
        if len(candidate) > 0 and isinstance(candidate[0], pd.DataFrame):
            dfs[heading] = remove_per_from_df(
                remove_space_from_df(candidate[0]))

    return dfs
def recognize(self):
    """Gather tables, unit and free text of index section ``06030300``.

    * ``'c'`` payloads: every table is tested on its first column. A table
      with a cell containing ``主要职业及职务`` is stored under
      ``'np_control'``; otherwise one with a cell containing ``变更日期`` is
      stored under ``'control_change'``. Stored tables are cleaned with
      ``remove_space_from_df`` and ``remove_per_from_df``.
    * ``'t'`` payloads: a text matching ``...单位:<unit>`` sets the unit; any
      other text is kept after everything up to and including each
      ``?适用?不适用`` marker has been removed, unless nothing is left.

    :return: tuple ``(dfs, unit, text)``. For any other ``self.indexno`` the
        result is ``({}, '元', '')``.
    """
    dfs = {}
    text_parts = []
    unit = '元'
    unit_pattern = re.compile('^.*?单位:(.*?)$')

    if self.indexno not in ['06030300']:
        return dfs, unit, ''.join(text_parts)

    for section in self.indexcontent:
        for kind, payload in section.items():
            if kind == 'c' and len(payload) > 0:
                for group in payload:
                    for table in group:
                        if table.iloc[:, 0].str.contains('主要职业及职务').any():
                            key = 'np_control'
                        elif table.iloc[:, 0].str.contains('变更日期').any():
                            key = 'control_change'
                        else:
                            # Other tables are ignored (original note here:
                            # "add legal-person description").
                            continue
                        dfs[key] = remove_per_from_df(
                            remove_space_from_df(table))
            elif kind == 't' and len(payload) > 0:
                unit_match = unit_pattern.match(payload)
                if unit_match:
                    unit = unit_match.groups()[0]
                    continue
                remainder = re.sub('.*?.适用.不适用', '', payload)
                if remainder != '':
                    text_parts.append(remainder)

    return dfs, unit, ''.join(text_parts)
def recognize(self):
    """Gather tables, unit and free text of index section ``0603020000``.

    * ``'c'`` payloads: every table is tested on its first row. A table with
      a cell containing ``质押或冻结情况`` is stored under ``'topten'``;
      otherwise one with a cell containing ``股份种类`` is stored under
      ``'unlimit_sale'``. Stored tables are cleaned with
      ``remove_space_from_df`` and ``remove_per_from_df``.
    * ``'t'`` payloads: a text matching ``...单位:<unit>`` sets the unit; any
      other text is kept after everything up to and including each
      ``?适用?不适用`` marker has been removed, unless nothing is left.

    :return: tuple ``(dfs, unit, text)``. For any other ``self.indexno`` the
        result is ``({}, '元', '')``.
    """
    # Checked in this order; the first marker found in the header row wins.
    header_markers = (
        ('质押或冻结情况', 'topten'),
        ('股份种类', 'unlimit_sale'),
    )
    dfs = {}
    collected = []
    unit = '元'
    pattern0 = re.compile('^.*?单位:(.*?)$')

    wanted = self.indexno in ['0603020000']
    sections = self.indexcontent if wanted else []
    for section in sections:
        for label, body in section.items():
            if label == 'c' and len(body) > 0:
                for group in body:
                    for table in group:
                        for marker, key in header_markers:
                            if table.iloc[0, :].str.contains(marker).any():
                                dfs[key] = remove_per_from_df(
                                    remove_space_from_df(table))
                                break
            elif label == 't' and len(body) > 0:
                hit = pattern0.match(body)
                if hit is not None:
                    unit = hit.groups()[0]
                else:
                    stripped = re.sub('.*?.适用.不适用', '', body)
                    if stripped != '':
                        collected.append(stripped)

    return dfs, unit, ''.join(collected)
def print_contents(indexno, indexcontents):
    """Print an index section to stdout and save the attributes of its tables.

    For each ``{classify: item}`` mapping in ``indexcontents``:

    * ``'t'`` entries are printed as they are;
    * non-empty ``'c'`` entries are walked table by table. String entries
      are skipped; every other table is passed to ``save_table_attr`` inside
      ``transaction.atomic()`` and then printed after
      ``remove_space_from_df``;
    * anything else prints ``未统计``.

    A dashed separator is printed after every mapping and a closing line
    containing ``indexno`` at the very end.

    :param indexno: index number handed to ``save_table_attr`` and shown in
        the closing line.
    :param indexcontents: iterable of ``{classify: item}`` mappings.
    :return: ``None``.
    """
    for section in indexcontents:
        for kind, payload in section.items():
            if kind == 't':
                print(kind, payload)
                continue
            if not (kind == 'c' and len(payload) > 0):
                print('未统计')
                continue
            for group in payload:
                for table in group:
                    if isinstance(table, str):
                        continue
                    with transaction.atomic():
                        save_table_attr(table, indexno)
                    print(kind, remove_space_from_df(table))
        print('-------------------')
    print('--------{}结束----------'.format(indexno))
def recognize_df_and_instucti(indexcontent):
    """Extract the single table, the unit and the free text of a section.

    Every element of ``indexcontent`` is a mapping from a classification key
    to its payload:

    * ``'c'`` -- groups of tables. When there is more than one group they are
      first passed through ``combine_table_to_first``. Each table is cleaned
      with ``remove_space_from_df`` and ``remove_per_from_df``, printed
      together with the detected column and row indexes, and collected.
    * ``'t'`` -- a text string. A text matching
      ``...单位<colon><up to three characters>元...`` sets the unit; any other
      text is kept after everything up to and including each
      ``?适用?不适用`` marker has been removed, unless nothing is left.

    :param indexcontent: iterable of ``{classify: item}`` mappings.
    :return: tuple ``(df, unit, text)`` -- the only table found (``None``
        when there was none), the unit (``'元'`` by default) and the
        concatenated text.
    :raises ValueError: if more than one table was collected.
    """
    dfs = []
    instructi = []
    unit = '元'
    # The character class previously listed the ASCII colon twice; the second
    # entry is now the full-width colon (U+FF1A), written as an escape so it
    # survives text normalisation of the source file.
    pattern0 = re.compile('^.*?单位[:\uff1a](.{0,3}元).*?$')

    for content in indexcontent:
        for classify, item in content.items():
            if classify == 'c' and len(item) > 0:
                if len(item) > 1:
                    item = combine_table_to_first(item)
                for tables in item:
                    for table in tables:
                        df = remove_per_from_df(remove_space_from_df(table))
                        print(df)
                        print('提取的列索引是{}'.format(detect_columns(df)))
                        print('提取的行索引是{}'.format(detect_indexes(df)))
                        dfs.append(df)
            elif classify == 't' and len(item) > 0:
                # Match once and reuse the result.
                unit_match = pattern0.match(item)
                if unit_match:
                    unit = unit_match.group(1)
                else:
                    ret = re.sub('.*?.适用.不适用', '', item)
                    if ret != '':
                        instructi.append(ret)

    # Callers expect exactly one table (or none); several cannot be returned.
    if len(dfs) > 1:
        raise ValueError(
            'expected at most one table, found {}'.format(len(dfs)))
    df = dfs[0] if dfs else None
    return df, unit, ''.join(instructi)
def recognize(self):
    """Collect the text and the explanation table of sections 0302000000 / 03020100.

    * ``'t'`` payloads: each non-empty text has every ``?适用?不适用`` marker
      removed and is appended to the running text.
    * ``'c'`` payloads: only the first table of the first table group is
      inspected; if any cell of its first row contains ``说明`` the
      space-cleaned table is kept (a later match replaces an earlier one).

    :return: dict with key ``'content'`` (the concatenated text) and key
        ``'table'`` (the kept table, or ``None``).
    """
    # NOTE(review): the collapsed source does not show whether the result
    # keys were filled inside or outside the indexno check; they are filled
    # unconditionally here -- confirm against the original layout.
    indexnos = ['0302000000', '03020100']
    texts = []
    table = None

    if self.indexno in indexnos:
        for content in self.indexcontent:
            for classify, item in content.items():
                if classify == 't' and len(item) > 0:
                    # Original note: logic check -- contains the company's
                    # Chinese name.
                    texts.append(re.sub('.适用.不适用', '', item))
                elif classify == 'c' and len(item) > 0:
                    first_table = item[0][0]
                    if first_table.iloc[0, :].str.contains("说明").any():
                        table = remove_space_from_df(first_table)

    return {'content': ''.join(texts), 'table': table}
if len(set(ret)) == 1 and list(set(ret))[0] == True: return True else: return False def compare_list_similar(list1, list2, per=0.8): sl = similar_list(list1, list2, per) if len(sl) == len(list1): return True else: return False df1 = pd.read_sql('select * from report_data_extract_tableattr',engine) df1 = remove_space_from_df(df1) df2 = pd.read_sql('report_data_extract_stdcontentindex',engine) df2 = remove_space_from_df(df2) df3 = pd.merge(df1,df2,how='outer',left_on='indexno_id',right_on='id') df4 = pd.pivot_table(df3,index=['no','name','columns'],values=['no_name'],aggfunc='count') df5 = pd.pivot_table(df3,index=['no','name','indexes'],values=['no_name'],aggfunc='count') df4 = df4.reset_index() df5 = df5.reset_index() df6 = df4.groupby('no').apply(lambda t: t[t.no_name==t.no_name.max()]) df7 = df5.groupby('no').apply(lambda t: t[t.no_name==t.no_name.max()]) df6 = df6[df6.no_name!=1] df7 = df7[df7.no_name!=1] df8 = pd.merge(df6,df7,how='outer',left_on='no',right_on='no') index_std = [] count = [] for key,(x1,x2) in enumerate(list(zip(df8.no_name_x,df8.no_name_y))):