예제 #1
0
def recognize_instucti(indexcontent):
    """Collect the unit, the descriptive text and the last table of a section.

    :param indexcontent: iterable of dicts mapping a kind tag to a payload.
        A ``'c'`` payload is iterated as groups of entries (presumably
        tables; plain strings are skipped), a ``'t'`` payload is a string.
    :return: tuple ``(df, unit, text)`` where ``df`` is the last processed
        table (``None`` if there was none), ``unit`` is the text captured
        after ``单位`` (default ``'元'``) and ``text`` is the concatenation
        of every table dump and every non-empty text fragment.
    """
    unit_regex = re.compile('^.*?单位[::](.*?)$')
    latest_df = None
    current_unit = '元'
    pieces = []

    for section in indexcontent:
        for kind, payload in section.items():
            if kind == 'c' and len(payload) > 0:
                # Walk every non-string entry of every group, in order.
                cells = (cell for group in payload for cell in group
                         if not isinstance(cell, str))
                for cell in cells:
                    latest_df = remove_per_from_df(remove_space_from_df(cell))
                    print(latest_df)
                    print('提取的列索引是{}'.format(detect_columns(latest_df)))
                    print('提取的行索引是{}'.format(detect_indexes(latest_df)))
                    pieces.append(latest_df.to_string())
                continue

            if kind != 't' or len(payload) == 0:
                continue

            unit_match = unit_regex.match(payload)
            if unit_match:
                current_unit = unit_match.groups()[0]
                continue

            # Drop the "applicable / not applicable" checkbox prefix.
            remainder = re.sub('.*?.适用.不适用', '', payload)
            if remainder != '':
                pieces.append(remainder)

    return latest_df, current_unit, ''.join(pieces)
예제 #2
0
 def recognize(self):
     """Pick the accounting-firm and financial-adviser tables of the section.

     Only runs when ``self.indexno`` is ``'02050000'``. For every non-empty
     ``'c'`` payload in ``self.indexcontent`` the first entry of the first
     group is inspected: its first column is matched against two markers
     and the cleaned table is stored under the corresponding key.

     :return: dict with optional keys ``'accout'`` and ``'sponsor'``.
     """
     found = {}
     if self.indexno not in ['02050000']:
         return found

     # (marker matched against the first column, key in the result dict)
     markers = (("会计师事务所名称", 'accout'), ("财务顾问名称", 'sponsor'))

     for section in self.indexcontent:
         for kind, payload in section.items():
             if kind != 'c' or len(payload) == 0:
                 continue
             for marker, key in markers:
                 if payload[0][0].iloc[:, 0].str.match(marker).any():
                     found[key] = remove_space_from_df(payload[0][0])
                     break
     return found
예제 #3
0
def get_dfs(classify, item):
    """Extract several tables from a mixed sequence of captions and tables.

    :param classify: tuple of keywords; each one is looked up among the
        string entries of ``item``.
    :param item: sequence whose entries are either strings or lists (the
        first element of a list is used as the table).
    :return: dict holding the first table under ``'first'`` plus, for each
        keyword, the table that directly follows the matching string entry,
        keyed by that string entry (or by the keyword itself when no string
        entry contains it).
    :raises NoTupleException: if ``classify`` is not a tuple.
    :raises Exception: if ``item`` contains an entry that is neither a
        string nor a list.
    """
    if not isinstance(classify, tuple):
        raise NoTupleException

    dfs = {}

    # The first non-string entry supplies the 'first' table.
    for entry in item:
        if isinstance(entry, str):
            continue
        dfs['first'] = remove_per_from_df(remove_space_from_df(entry[0]))
        break

    # Copy the entries while validating their types.
    sequence = []
    for entry in item:
        if not isinstance(entry, (str, list)):
            raise Exception
        sequence.append(entry)

    total = len(sequence)
    captions = [entry for entry in sequence if isinstance(entry, str)]

    for keyword in classify:
        # Prefer the first caption containing the keyword over the keyword.
        label = next((text for text in captions if keyword in text), keyword)
        if label not in sequence:
            continue
        follower = sequence.index(label) + 1
        if follower < total and isinstance(sequence[follower][0],
                                            pd.DataFrame):
            dfs[label] = remove_per_from_df(
                remove_space_from_df(sequence[follower][0]))

    return dfs
예제 #4
0
    def recognize(self):
        """Extract the controller tables, the unit and the descriptive text.

        Only runs when ``self.indexno`` is ``'06030300'``. ``'c'`` payloads
        are walked group by group; a table whose first column contains
        ``主要职业及职务`` is stored as ``'np_control'`` and one containing
        ``变更日期`` as ``'control_change'``. ``'t'`` payloads either set
        the unit (text after ``单位:``) or are appended to the description
        once the "applicable / not applicable" prefix has been removed.

        :return: tuple ``(dfs, unit, text)``; ``unit`` defaults to ``'元'``.
        """
        dfs = {}
        instructi = []
        unit = '元'
        pattern0 = re.compile('^.*?单位:(.*?)$')
        # (marker searched in the first column, key in the result dict)
        table_markers = (
            ('主要职业及职务', 'np_control'),
            ('变更日期', 'control_change'),
        )

        if self.indexno not in ['06030300']:
            return dfs, unit, ''.join(instructi)

        for content in self.indexcontent:
            for classify, item in content.items():
                if classify == 'c' and len(item) > 0:
                    for tables in item:
                        for table in tables:
                            if isinstance(table, str):
                                # Plain-text entries have no ``.iloc``;
                                # skip them instead of crashing.
                                continue
                            first_column = table.iloc[:, 0]
                            for marker, key in table_markers:
                                if first_column.str.contains(marker).any():
                                    dfs[key] = remove_per_from_df(
                                        remove_space_from_df(table))
                                    break
                            # TODO: add handling for the legal-person
                            # description (tables matching no marker).
                elif classify == 't' and len(item) > 0:
                    unit_match = pattern0.match(item)
                    if unit_match:
                        unit = unit_match.groups()[0]
                    else:
                        ret = re.sub('.*?.适用.不适用', '', item)
                        if ret != '':
                            instructi.append(ret)

        return dfs, unit, ''.join(instructi)
예제 #5
0
    def recognize(self):
        """Extract the shareholder tables, the unit and the descriptive text.

        Only runs when ``self.indexno`` is ``'0603020000'``. ``'c'``
        payloads are walked group by group; a table whose first row contains
        ``质押或冻结情况`` is stored as ``'topten'`` and one containing
        ``股份种类`` as ``'unlimit_sale'``. ``'t'`` payloads either set the
        unit (text after ``单位:``) or are appended to the description once
        the "applicable / not applicable" prefix has been removed.

        :return: tuple ``(dfs, unit, text)``; ``unit`` defaults to ``'元'``.
        """
        dfs = {}
        instructi = []
        unit = '元'
        pattern0 = re.compile('^.*?单位:(.*?)$')
        # (marker searched in the first row, key in the result dict)
        table_markers = (
            ('质押或冻结情况', 'topten'),
            ('股份种类', 'unlimit_sale'),
        )

        if self.indexno not in ['0603020000']:
            return dfs, unit, ''.join(instructi)

        for content in self.indexcontent:
            for classify, item in content.items():
                if classify == 'c' and len(item) > 0:
                    for tables in item:
                        for table in tables:
                            if isinstance(table, str):
                                # Plain-text entries have no ``.iloc``;
                                # skip them instead of crashing.
                                continue
                            first_row = table.iloc[0, :]
                            for marker, key in table_markers:
                                if first_row.str.contains(marker).any():
                                    dfs[key] = remove_per_from_df(
                                        remove_space_from_df(table))
                                    break
                elif classify == 't' and len(item) > 0:
                    unit_match = pattern0.match(item)
                    if unit_match:
                        unit = unit_match.groups()[0]
                    else:
                        ret = re.sub('.*?.适用.不适用', '', item)
                        if ret != '':
                            instructi.append(ret)

        return dfs, unit, ''.join(instructi)
예제 #6
0
def print_contents(indexno, indexcontents):
    """Print every entry of a section and persist the attributes of its tables.

    :param indexno: identifier passed to ``save_table_attr`` and echoed in
        the closing line.
    :param indexcontents: iterable of dicts mapping a kind tag to a payload.
        ``'t'`` payloads are printed as is; non-empty ``'c'`` payloads are
        walked group by group, each non-string entry being saved inside its
        own ``transaction.atomic()`` block and then printed; anything else
        is reported as ``未统计``.
    """
    separator = '-------------------'
    for section in indexcontents:
        for kind, payload in section.items():
            if kind == 't':
                print(kind, payload)
            elif kind != 'c' or len(payload) == 0:
                print('未统计')
            else:
                for group in payload:
                    for cell in group:
                        if not isinstance(cell, str):
                            with transaction.atomic():
                                save_table_attr(cell, indexno)
                            print(kind, remove_space_from_df(cell))
            print(separator)
    print('--------{}结束----------'.format(indexno))
예제 #7
0
def recognize_df_and_instucti(indexcontent):
    """Collect the single table, the unit and the descriptive text of a section.

    :param indexcontent: iterable of dicts mapping a kind tag to a payload.
        A non-empty ``'c'`` payload is walked group by group (payloads with
        more than one group first go through ``combine_table_to_first``);
        a ``'t'`` payload is a string.
    :return: tuple ``(df, unit, text)`` where ``df`` is the only table found
        (``None`` if there was none), ``unit`` is the ``…元`` text captured
        after ``单位`` (default ``'元'``) and ``text`` is the concatenation
        of the non-empty text fragments.
    :raises Exception: if more than one table was collected.
    """
    dfs = []
    instructi = []
    unit = '元'
    pattern0 = re.compile('^.*?单位[::](.{0,3}元).*?$')
    for content in indexcontent:
        for classify, item in content.items():
            if classify == 'c' and len(item) > 0:
                if len(item) > 1:
                    item = combine_table_to_first(item)
                for tables in item:
                    for table in tables:
                        if isinstance(table, str):
                            # Text entries are not tables; skip them, as
                            # recognize_instucti does.
                            continue
                        df = remove_per_from_df(remove_space_from_df(table))
                        print(df)
                        print('提取的列索引是{}'.format(detect_columns(df)))
                        print('提取的行索引是{}'.format(detect_indexes(df)))
                        dfs.append(df)
            elif classify == 't' and len(item) > 0:
                unit_match = pattern0.match(item)
                if unit_match:
                    unit = unit_match.groups()[0]
                else:
                    # Drop the "applicable / not applicable" checkbox prefix.
                    ret = re.sub('.*?.适用.不适用', '', item)
                    if ret != '':
                        instructi.append(ret)

    if len(dfs) > 1:
        # Callers expect exactly one table; refuse to pick one silently.
        raise Exception(
            'expected at most one table, got {}'.format(len(dfs)))
    df = dfs[0] if dfs else None

    return df, unit, ''.join(instructi)
예제 #8
0
    def recognize(self):
        """Collect the text and the explanatory table of the section.

        Only runs when ``self.indexno`` is one of ``'0302000000'`` or
        ``'03020100'``. Every non-empty ``'t'`` payload is appended to the
        text after the "applicable / not applicable" marker is removed; for
        every non-empty ``'c'`` payload the first entry of the first group
        is kept as the table when its first row contains ``说明``.

        :return: dict with keys ``'content'`` (joined text, possibly empty)
            and ``'table'`` (cleaned table or ``None``).
        """
        indexnos = ['0302000000', '03020100']

        contents = []
        table = None
        all_content = {}
        if self.indexno in indexnos:
            for content in self.indexcontent:
                for classify, item in content.items():
                    if classify == 't' and len(item) > 0:
                        # Original note: logic check - the text should
                        # contain the company's Chinese name (no such check
                        # is implemented here).
                        contents.append(re.sub('.适用.不适用', '', item))
                    elif classify == 'c' and len(item) > 0:
                        first_table = item[0][0]
                        if isinstance(first_table, str):
                            # Plain-text entries have no ``.iloc``; skip
                            # them instead of crashing.
                            continue
                        if first_table.iloc[0, :].str.contains("说明").any():
                            table = remove_space_from_df(first_table)
        all_content['content'] = ''.join(contents)
        all_content['table'] = table
        return all_content
예제 #9
0
    if len(set(ret)) == 1 and list(set(ret))[0] == True:
        return True
    else:
        return False


def compare_list_similar(list1, list2, per=0.8):
    """Tell whether ``similar_list`` yields as many items as ``list1`` has.

    :param list1: first list; its length is the reference.
    :param list2: second list, forwarded to ``similar_list``.
    :param per: threshold forwarded unchanged to ``similar_list``.
    :return: ``True`` when ``len(similar_list(list1, list2, per))`` equals
        ``len(list1)``, ``False`` otherwise.
    """
    return len(similar_list(list1, list2, per)) == len(list1)


# Module-level script: builds, per 'no', the most frequent 'columns' and
# 'indexes' values recorded in the table-attribute data.

# Load the table-attribute rows via a SQL query and clean them.
df1 = pd.read_sql('select * from report_data_extract_tableattr',engine)
df1 = remove_space_from_df(df1)
# Here read_sql receives a bare name rather than a query.
# NOTE(review): presumably pandas treats it as a table name, which needs a
# SQLAlchemy connectable - confirm what `engine` is.
df2 = pd.read_sql('report_data_extract_stdcontentindex',engine)
df2 = remove_space_from_df(df2)
# Outer-join both frames: df1.indexno_id against df2.id.
df3 = pd.merge(df1,df2,how='outer',left_on='indexno_id',right_on='id')
# Count 'no_name' values per (no, name, columns) and per (no, name, indexes).
df4 = pd.pivot_table(df3,index=['no','name','columns'],values=['no_name'],aggfunc='count')
df5 = pd.pivot_table(df3,index=['no','name','indexes'],values=['no_name'],aggfunc='count')
# Turn the pivot index levels back into ordinary columns.
df4 = df4.reset_index()
df5 = df5.reset_index()
# Within each 'no' group keep only the rows whose count equals the group
# maximum (ties are all kept).
# NOTE(review): after groupby('no').apply the result may carry 'no' both as
# an index level and as a column - confirm the merge on 'no' below is not
# rejected as ambiguous by the pandas version in use.
df6 = df4.groupby('no').apply(lambda t: t[t.no_name==t.no_name.max()])
df7 = df5.groupby('no').apply(lambda t: t[t.no_name==t.no_name.max()])
# Discard rows whose count is exactly 1.
df6 = df6[df6.no_name!=1]
df7 = df7[df7.no_name!=1]
# Combine the column-based and index-based results on 'no'; the overlapping
# count columns are read below as no_name_x / no_name_y.
df8 = pd.merge(df6,df7,how='outer',left_on='no',right_on='no')
# Accumulators for the loop that follows.
index_std = []
count = []
for key,(x1,x2) in enumerate(list(zip(df8.no_name_x,df8.no_name_y))):