def update_children(self, reg1, reg2):
    """Combine the ``children`` lists of two regions into one list.

    A ``children`` entry that is missing, ``None`` or empty counts as
    "no children". When only one region has children, that list is returned
    unchanged (same object). When both have children, the lists are
    concatenated, ordered by the ``y`` of each child's first bounding-box
    vertex and passed through ``sort_regions``; if more than one child
    remains they are then passed through ``horzontal_merging``.

    Args:
        reg1: Region dict whose ``children`` entry may be absent, ``None``,
            empty, or a list of child dicts.
        reg2: Second region dict, same expectations as ``reg1``.

    Returns:
        The combined children, or a new empty list when neither region has
        any.
    """
    first = reg1.get('children') or []
    second = reg2.get('children') or []

    # Nothing to combine: hand back whichever side has content.
    if not first:
        return second
    if not second:
        return first

    # ``sorted`` builds a new list, so neither input list is reordered.
    combined = sorted(
        first + second,
        key=lambda child: child['boundingBox']['vertices'][0]['y'],
    )
    children = sort_regions(combined, [])
    if len(children) <= 1:
        return children

    # Page_Config is only needed on this path, so it is created here rather
    # than on every call.
    page_config = Page_Config()
    avg_region_height, avg_region_ver_dist, _ = page_config.avg_line_info(
        [{'children': children}])
    # max(1, ...) guards against a zero average height.
    average_region_ver_ratio = avg_region_ver_dist / max(1, avg_region_height)
    return horzontal_merging(children, average_region_ver_ratio)
def update_children(reg1, reg2):
    """Combine the ``children`` lists of two regions into one list.

    A ``children`` entry that is missing, ``None`` or empty counts as
    "no children". When only one region has children, that list is returned
    unchanged (same object). When both have children, the lists are
    concatenated, ordered by the ``y`` of each child's first bounding-box
    vertex and passed through ``sort_regions``. No horizontal merging is
    applied by this function.

    Args:
        reg1: Region dict whose ``children`` entry may be absent, ``None``,
            empty, or a list of child dicts.
        reg2: Second region dict, same expectations as ``reg1``.

    Returns:
        The combined children, or a new empty list when neither region has
        any.
    """
    first = reg1.get('children') or []
    second = reg2.get('children') or []

    # Nothing to combine: hand back whichever side has content.
    if not first:
        return second
    if not second:
        return first

    # ``sorted`` builds a new list, so neither input list is reordered.
    combined = sorted(
        first + second,
        key=lambda child: child['boundingBox']['vertices'][0]['y'],
    )
    return sort_regions(combined, [])
def region_unifier(self,file,page_g_words, page_lines,page_regions,page_c_words,path):
    """Assemble the list of unified blocks for one page.

    Words, lines and layout regions are combined step by step: image areas
    are excluded first, then table regions are populated with words/cells,
    then header/footer regions and text regions are populated with lines.
    Text blocks (class ``TEXT`` is renamed to ``PARA``) and table blocks
    (class set to ``TABLE``) end up in one list.

    Side effect: stores ``avg_ver_dist / avg_height`` of the text blocks on
    ``self.avg_ver_ratio``.

    NOTE(review): the nesting below was reconstructed from a copy whose
    whitespace had been collapsed; confirm it against version control.

    Args:
        file: Passed through to ``collate_text``.
        page_g_words: Word list handed to ``collate_text`` (third argument).
        page_lines: Line list; run through ``add_font`` before use.
        page_regions: Layout regions; filtered with ``filterd_regions`` and
            ordered by the ``y`` of their first bounding-box vertex.
        page_c_words: Word list handed to ``collate_text`` (second argument).
        path: Only used in the "region key not found" log message.

    Returns:
        ``(v_list, n_text_table_regions)`` on success, where
        ``n_text_table_regions`` is the second value returned by
        ``self.get_text_tabel_region``. ``(None, None)`` if any exception is
        raised; the exception is logged, not propagated.
    """
    try:
        #sort regions
        page_lines = add_font(page_lines)
        page_regions = filterd_regions(page_regions)
        if len(page_regions) > 0 :
            page_regions.sort(key=lambda x:x['boundingBox']['vertices'][0]['y'])
            sorted_page_regions = sort_regions(page_regions,[])
        else:
            sorted_page_regions = page_regions
        page_words = collate_text(file,page_c_words, page_g_words)
        # Split the sorted regions by kind.
        text_region,n_text_table_regions,tabel_region,image_region,head_foot_region = self.get_text_tabel_region(sorted_page_regions)
        # Presumably remvoe_regions(a, b) returns the items of b that do not
        # fall inside any region of a -- TODO confirm against its definition.
        # Every call works on deep copies, so its inputs are left untouched.
        tabel_region = remvoe_regions(copy.deepcopy(image_region), copy.deepcopy(tabel_region))
        filtered_words = remvoe_regions(copy.deepcopy(image_region), copy.deepcopy(page_words))
        filtered_lines = remvoe_regions(copy.deepcopy(image_region), copy.deepcopy(page_lines))
        t_list = []
        for idx,table in enumerate(tabel_region):
            # Tables without a 'regions' key are skipped and never reach t_list.
            if 'regions' in table.keys():
                filtered_words = remvoe_regions(copy.deepcopy(table['regions']), copy.deepcopy(filtered_words))
                filtered_lines = remvoe_regions(copy.deepcopy(table['regions']), copy.deepcopy(filtered_lines))
                # The table's regions are collated against page_words as it
                # stands at this iteration (already narrowed by earlier tables).
                tabel_region[idx]['regions'] = collate_regions(regions = copy.deepcopy(table['regions']),lines = copy.deepcopy(page_words),child_class='WORD',grand_children=False,region_flag = False)
                page_words = filtered_words
                page_lines = filtered_lines
                t_list.append(tabel_region[idx])
        t_list = collate_cell_regions(copy.deepcopy(t_list),copy.deepcopy(page_words),child_class='CELL_TEXT',grand_children=True,region_flag = False)
        # Exclude the collated tables from the remaining words, lines and text regions.
        page_words = remvoe_regions(copy.deepcopy(t_list), copy.deepcopy(page_words))
        filtered_lines = remvoe_regions(copy.deepcopy(t_list), copy.deepcopy(page_lines))
        filtered_words = copy.deepcopy(page_words)
        text_region = remvoe_regions(copy.deepcopy(t_list) ,copy.deepcopy(text_region))
        # Lines collated with words, then header/footer regions collated with those lines.
        line_list = collate_regions(copy.deepcopy( filtered_lines), copy.deepcopy( filtered_words),child_class='WORD',add_font=True)
        head_foot_list = collate_regions(copy.deepcopy(head_foot_region),copy.deepcopy(line_list),child_class='LINE',grand_children=True,region_flag = False)
        filtered_lines = remvoe_regions(copy.deepcopy(head_foot_list), copy.deepcopy(line_list))
        v_list = collate_regions( copy.deepcopy( text_region),copy.deepcopy( filtered_lines ),child_class='LINE' ,grand_children=True,add_font=True )
        # NOTE(review): i_list is not read again anywhere in this function.
        i_list = collate_regions(copy.deepcopy( image_region),copy.deepcopy(page_words),grand_children=True,region_flag = False,skip_enpty_children=True)
        page_config = Page_Config()
        # Page-level averages are taken over the text blocks only, before the
        # header/footer and table blocks are appended.
        avg_height, avg_ver_dist, avg_width = page_config.avg_line_info(v_list)
        if avg_height == 0:
            # Avoid dividing by zero below.
            avg_height = 1
        self.avg_ver_ratio = avg_ver_dist /avg_height
        v_list.extend(head_foot_list)
        for idx,v_block in enumerate(v_list):
            if 'class' in v_list[idx].keys():
                if v_list[idx]['class'] == 'TEXT':
                    v_list[idx]['class']= "PARA"
            if 'regions' in v_block.keys():
                if v_block['regions'] != None and len(v_block['regions']) > 1 :
                    avg__region_height, avg__region_ver_dist, avg__region_width = page_config.avg_line_info([v_block])
                    v_block['avg_ver_dist'] = avg__region_ver_dist
                    # NOTE(review): this ratio is only consumed by the
                    # disabled call on the next line.
                    avrage_region_ver_ratio= avg__region_ver_dist / max(1,avg__region_height)
                    #v_block['regions'] = horzontal_merging(v_block['regions'],avrage_region_ver_ratio)
            else:
                log_info('region key not found for {} in page {}'.format(v_block, path),app_context.application_context )
            # Blocks are emitted without a 'children' key.
            if 'children' in v_block.keys():
                v_block.pop('children')
            v_list[idx] =copy.deepcopy(v_block)
        for idx,t_block in enumerate(t_list):
            t_list[idx]['class'] = 'TABLE'
            if t_block['regions'] != None and len(t_block['regions']) > 1 :
                avg__region_height, avg__region_ver_dist, avg__region_width = page_config.avg_line_info([t_block])
                t_block['avg_ver_dist'] = avg__region_ver_dist
                # NOTE(review): computed but not used afterwards.
                avrage_region_ver_ratio= avg__region_ver_dist / max(1,avg__region_height)
            t_list[idx] =copy.deepcopy(t_block)
        avg_word_sepc = page_config.avg_word_sep(line_list)
        v_list.extend(t_list)
        if self.check_double_column(v_list,avg_height):
            # Early exit; the tuple is the same one returned at the end.
            print("this document is double columnssssssss")
            return v_list, n_text_table_regions
        # NOTE(review): flag starts as False, so the loop body never runs and
        # merge_remove_overlap is never called from here. Confirm whether the
        # merge step was switched off on purpose.
        flag = False
        while flag==True:
            v_list, flag = self.merge_remove_overlap(v_list,avg_height, avg_ver_dist, avg_width,avg_word_sepc)
    except Exception as e:
        # Any failure is logged and reported to the caller as (None, None).
        log_exception("Error occured during block unifier", app_context.application_context, e)
        return None ,None
    return v_list, n_text_table_regions