示例#1
0
    def update_children(self, reg1, reg2):
        """Combine the ``children`` lists of two regions into a single list.

        Args:
            reg1: Mapping with a ``'children'`` entry (a list, possibly empty,
                or ``None``).
            reg2: Mapping with a ``'children'`` entry, same shape as ``reg1``.

        Returns:
            * ``[]`` when neither region has children;
            * the non-empty side's own ``children`` list (same object, not a
              copy) when only one region has children;
            * otherwise the concatenated children, ordered by the ``y`` of
              their first bounding-box vertex, passed through
              ``sort_regions`` and, when more than one child remains,
              through ``horzontal_merging``.

        Raises:
            KeyError: if either region has no ``'children'`` key.
        """
        first = reg1['children']
        second = reg2['children']

        # Guard clauses: nothing to merge unless both sides carry children.
        # ``None`` and an empty list are treated the same way.
        if not first:
            return second if second else []
        if not second:
            return first

        # ``first + second`` builds a new list, so neither input is mutated.
        merged = sorted(
            first + second,
            key=lambda child: child['boundingBox']['vertices'][0]['y'])
        children = sort_regions(merged, [])
        if len(children) <= 1:
            return children

        # Page_Config is only needed on this path, so it is created here
        # instead of unconditionally at the top of every call.
        avg_region_height, avg_region_ver_dist, _ = Page_Config().avg_line_info(
            [{'children': children}])
        # max(1, ...) keeps the ratio finite when the reported height is 0.
        average_region_ver_ratio = avg_region_ver_dist / max(
            1, avg_region_height)
        return horzontal_merging(children, average_region_ver_ratio)
示例#2
0
def update_children(reg1,reg2):
    """Combine the ``children`` of two regions into one list.

    If only one region has a non-empty ``children`` list, that list is
    returned as-is (same object). If neither has one, ``[]`` is returned.
    If both do, the two lists are concatenated, ordered by the ``y`` of each
    child's first bounding-box vertex, and handed to ``sort_regions``; its
    result is returned unchanged.
    """
    first = reg1['children']
    has_first = first is not None and len(first) > 0
    second = reg2['children']
    has_second = second is not None and len(second) > 0

    if not has_first:
        return second if has_second else []
    if not has_second:
        return first

    # Concatenation creates a fresh list, so the inputs stay untouched.
    combined = sorted(
        first + second,
        key=lambda region: region['boundingBox']['vertices'][0]['y'],
    )
    return sort_regions(combined, [])
示例#3
0
    def region_unifier(self,file,page_g_words, page_lines,page_regions,page_c_words,path):
        """Assemble one page's words, lines and layout regions into a list of blocks.

        The helpers used here (``add_font``, ``filterd_regions``,
        ``sort_regions``, ``collate_text``, ``remvoe_regions``,
        ``collate_regions``, ``collate_cell_regions``, ``Page_Config``) are
        defined outside this method; the comments below describe how their
        results are wired together, not what they do internally.

        Args:
            file: Passed through to ``collate_text``.
            page_g_words: Word collection, third argument of ``collate_text``.
            page_lines: Line collection; run through ``add_font`` first.
            page_regions: Region dicts; after ``filterd_regions`` each one is
                sorted on ``['boundingBox']['vertices'][0]['y']``.
            page_c_words: Word collection, second argument of ``collate_text``.
            path: Only used in the log message emitted for blocks that have
                no ``'regions'`` key.

        Returns:
            ``(v_list, n_text_table_regions)`` on success, where ``v_list``
            holds the text blocks, then the header/footer blocks, then the
            table blocks. ``(None, None)`` if any exception is raised inside
            the ``try`` (the exception is logged, not re-raised).

        Side effects:
            Sets ``self.avg_ver_ratio``.

        NOTE(review): ``flag`` is initialised to ``False`` right before
        ``while flag==True``, so ``merge_remove_overlap`` is never called from
        here. Confirm whether the merge pass was disabled on purpose.
        """
        try:

            # Prepare lines and regions, then order regions by the y of their
            # first vertex before handing them to sort_regions.
            page_lines = add_font(page_lines)
            page_regions  = filterd_regions(page_regions)
            if len(page_regions) > 0 :
                page_regions.sort(key=lambda x:x['boundingBox']['vertices'][0]['y'])
                sorted_page_regions = sort_regions(page_regions,[])

            else:
                sorted_page_regions = page_regions



            page_words = collate_text(file,page_c_words, page_g_words)

            # Split the sorted regions into five groups, then apply
            # remvoe_regions with the image regions against tables, words and
            # lines (presumably dropping whatever falls inside an image --
            # TODO confirm against remvoe_regions).
            text_region,n_text_table_regions,tabel_region,image_region,head_foot_region = self.get_text_tabel_region(sorted_page_regions)
            tabel_region  = remvoe_regions(copy.deepcopy(image_region), copy.deepcopy(tabel_region))
            filtered_words = remvoe_regions(copy.deepcopy(image_region), copy.deepcopy(page_words))
            filtered_lines = remvoe_regions(copy.deepcopy(image_region), copy.deepcopy(page_lines))

            # Only tables that carry a 'regions' key end up in t_list.
            # page_words / page_lines are rebound to the filtered versions
            # inside the loop, so the collate_regions call of a later table
            # sees the words left over after the earlier tables, while the
            # first table sees the raw collate_text output.
            t_list = []
            for idx,table in enumerate(tabel_region):
                if 'regions' in table.keys():
                    filtered_words     = remvoe_regions(copy.deepcopy(table['regions']), copy.deepcopy(filtered_words))
                    filtered_lines    = remvoe_regions(copy.deepcopy(table['regions']), copy.deepcopy(filtered_lines))
                    tabel_region[idx]['regions'] =  collate_regions(regions = copy.deepcopy(table['regions']),lines = copy.deepcopy(page_words),child_class='WORD',grand_children=False,region_flag = False)
                    page_words = filtered_words
                    page_lines = filtered_lines
                    t_list.append(tabel_region[idx])


            t_list =  collate_cell_regions(copy.deepcopy(t_list),copy.deepcopy(page_words),child_class='CELL_TEXT',grand_children=True,region_flag = False)

            # Apply remvoe_regions with the collated tables against words,
            # lines and text regions, then build lines out of the remaining
            # words.
            page_words   = remvoe_regions(copy.deepcopy(t_list), copy.deepcopy(page_words))
            filtered_lines   = remvoe_regions(copy.deepcopy(t_list), copy.deepcopy(page_lines))
            filtered_words = copy.deepcopy(page_words)
            text_region  = remvoe_regions(copy.deepcopy(t_list) ,copy.deepcopy(text_region))
            line_list    = collate_regions(copy.deepcopy( filtered_lines), copy.deepcopy( filtered_words),child_class='WORD',add_font=True)

            # Header/footer regions get their lines first; those regions are
            # then passed to remvoe_regions against line_list before the text
            # regions are collated.
            head_foot_list =  collate_regions(copy.deepcopy(head_foot_region),copy.deepcopy(line_list),child_class='LINE',grand_children=True,region_flag = False)
            filtered_lines  = remvoe_regions(copy.deepcopy(head_foot_list), copy.deepcopy(line_list))






            v_list       = collate_regions( copy.deepcopy( text_region),copy.deepcopy( filtered_lines ),child_class='LINE' ,grand_children=True,add_font=True )
            # NOTE(review): i_list is not read again anywhere in this method.
            i_list       =  collate_regions(copy.deepcopy( image_region),copy.deepcopy(page_words),grand_children=True,region_flag = False,skip_enpty_children=True)



            # Page-level averages are taken over the text blocks only: the
            # header/footer blocks are appended to v_list after this call.
            page_config                         = Page_Config()
            avg_height, avg_ver_dist, avg_width = page_config.avg_line_info(v_list)

            # Avoid dividing by zero in the ratio below.
            if avg_height == 0:
                avg_height = 1
            self.avg_ver_ratio =   avg_ver_dist /avg_height
            v_list.extend(head_foot_list)

            for idx,v_block in enumerate(v_list):
                # Blocks classed 'TEXT' are relabelled 'PARA'.
                if 'class' in v_list[idx].keys():
                    if v_list[idx]['class'] == 'TEXT':
                        v_list[idx]['class']= "PARA"

                if 'regions' in v_block.keys():
                    if   v_block['regions'] != None and  len(v_block['regions']) > 1 :
                        avg__region_height, avg__region_ver_dist, avg__region_width = page_config.avg_line_info([v_block])
                        v_block['avg_ver_dist'] = avg__region_ver_dist
                        # NOTE(review): this ratio is currently unused; its
                        # only consumer is the commented-out call below.
                        avrage_region_ver_ratio= avg__region_ver_dist / max(1,avg__region_height)
                        #v_block['regions'] = horzontal_merging(v_block['regions'],avrage_region_ver_ratio)
                else:
                    log_info('region key not found for {}  in page {}'.format(v_block, path),app_context.application_context )

                # Drop 'children' from the block; only blocks that had the key
                # are replaced in v_list by a deep copy.
                if 'children' in v_block.keys():
                    v_block.pop('children')


                    v_list[idx] =copy.deepcopy(v_block)



            # Every table block is classed 'TABLE'. Unlike the loop above,
            # 'regions' is read here without checking that the key exists.
            for idx,t_block in enumerate(t_list):
                t_list[idx]['class'] = 'TABLE'
                if   t_block['regions'] != None and  len(t_block['regions']) > 1 :
                    avg__region_height, avg__region_ver_dist, avg__region_width = page_config.avg_line_info([t_block])
                    t_block['avg_ver_dist'] = avg__region_ver_dist

                    # NOTE(review): computed but not used afterwards.
                    avrage_region_ver_ratio= avg__region_ver_dist / max(1,avg__region_height)
                    t_list[idx] =copy.deepcopy(t_block)


            # Only consumed by the merge_remove_overlap call further down.
            avg_word_sepc     = page_config.avg_word_sep(line_list)

            v_list.extend(t_list)

            # When check_double_column returns a truthy value the blocks are
            # returned as they are.
            # NOTE(review): the print below looks like leftover debug output.
            if self.check_double_column(v_list,avg_height):
                print("this document is double columnssssssss")
                return v_list, n_text_table_regions
            # NOTE(review): flag starts as False, so this loop body never
            # executes and merge_remove_overlap is never invoked.
            flag = False
            while flag==True:
                v_list, flag = self.merge_remove_overlap(v_list,avg_height, avg_ver_dist, avg_width,avg_word_sepc)

        except Exception as e:
            # Any failure above is logged and reported to the caller as
            # (None, None) instead of propagating.
            log_exception("Error occured during block unifier",  app_context.application_context, e)
            return None  ,None

        return v_list, n_text_table_regions