def log_time(f):
    """Decorator that logs the wall-clock duration of each call to ``f``.

    Reconstructed from the inner wrapper: ``f`` is the decorated function and
    ``logger`` is assumed to be the module-level logger.
    """

    @functools.wraps(f)
    def func(*args, **kwargs):
        start_time = time.time()
        result = f(*args, **kwargs)
        end_time = time.time()
        logger.info('function {} took {:.3f} s'.format(
            f.__name__, end_time - start_time))
        return result

    return func
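# A minimal usage sketch for log_time; `slow_add` is a hypothetical example
# function, not part of the project.
@log_time
def slow_add(a, b):
    time.sleep(0.1)  # simulate work
    return a + b


slow_add(1, 2)  # logs: function slow_add took 0.100 s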
def process_single(self, item_name: str):
    logger.info(f'processing {item_name}')
    # Reset debug variables before each run
    debugger.variables.clean()
    config = self.config
    if config.preload_tpl:
        debugger.variables.resultsTemplate.append({
            "name": "tplImage",
            "text": "模板图",  # UI label: "template image"
            "image": {
                "data": "#{tplImage}"
            }
        })

    rec_data_dir = os.path.join(config.work_dir, config.recognition_data_dirname)
    rec_img_dir = os.path.join(config.work_dir, config.recognition_img_dirname)
    rec_data_path = os.path.join(rec_data_dir, item_name) + '.json'
    rec_img_path = os.path.join(rec_img_dir, item_name) + '.jpg'

    # Load the recognition result data
    with open(rec_data_path, mode='r', encoding='utf-8') as f:
        rec_data = json.load(f)

    if not rec_data:
        logger.warning(f'raw_data is not present. path={rec_data_path}')
        return

    rec_img = cv2.imread(rec_img_path)

    if isinstance(rec_data, list):
        # Legacy raw-data format: a flat list of recognition items
        rec_data = [_convert_ai_rec_data_item(item) for item in rec_data]
        start_time = time.time()
        structure_result = self.session.process(
            rec_data,
            rec_img,
            class_name=self.config.class_name,
            primary_class=self.config.primary_class,
            secondary_class=self.config.secondary_class,
            ltrb=False)
    else:
        # New raw-data format
        start_time = time.time()
        request, rpc_name = _convert_request(rec_data, rec_img, self.config)
        # Run structuring
        structure_result = self.request_processor.process(
            request,
            rpc_name,
            self.config.preload_tpl,
            item_name=item_name)

    process_duration = time.time() - start_time
    logger.debug(f'structuring took {process_duration}s')
    debugger.variables.structuringDuration = process_duration

    # Collect the structuring results
    self._pack_debug_data(structure_result)
    self._dump_debug_data(item_name)
def norm_match(self, node_items: Dict[str, NodeItem]) -> Tuple[List[NodeItem], List[int]]:
    out = []
    ed_dists = []
    for it in node_items.values():
        matched, ed = self._text_match(it.cn_text,
                                       remove_symbols=True,
                                       remove_space=True,
                                       ed_thresh=self.ed_thresh)
        if matched:
            logger.info(f"bg_item [{self}] match {it} by [norm_match]")
            out.append(it)
            ed_dists.append(ed)
    return out, ed_dists
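# `_text_match` is not shown in this section. A minimal sketch of what a
# normalized edit-distance match could look like under the same parameters
# (symbol/space stripping plus an edit-distance threshold); the real method
# may differ in its normalization rules.
import re

import editdistance


def _text_match_sketch(bg_text, node_text, remove_symbols=True,
                       remove_space=True, ed_thresh=1):
    if remove_symbols:
        bg_text = re.sub(r'[^\w\s]', '', bg_text)
        node_text = re.sub(r'[^\w\s]', '', node_text)
    if remove_space:
        bg_text = ''.join(bg_text.split())
        node_text = ''.join(node_text.split())
    dist = editdistance.eval(bg_text, node_text)
    return dist <= ed_thresh, dist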
def parse_template(
    self,
    node_items: Dict[int, TpNodeItem],
    img: np.ndarray,
    debug_data: DebugData = None,
):
    """
    :param node_items:
    :param img: BGR image as an ndarray. Some steps may re-run recognition
                and therefore need the original image.
    :return: dict of StructureItem keyed by item_name
    """
    structure_items = {}
    for fg_item in self.fg_items.values():
        fg_item.load_data(node_items)
        item_result = fg_item.run_parse(img, debug_data=debug_data)

        if item_result is None:
            content, scores = "", [0]
        else:
            content, scores = item_result

        si = StructureItem(
            item_name=fg_item.item_name,
            show_name=fg_item.show_name,
            content=content,
            scores=scores,
        )
        structure_items[fg_item.item_name] = si

    for region_item in self.region_items.values():
        region_item.load_data(node_items)
        region_item.run_parse(img, structure_items)

    # Pass the image into post-processing together with structure_items, in
    # case the post-processing stage needs image-related information.
    structure_items = self.tmpl_post_proc(structure_items, self.fg_items, img)

    # Drop structured results whose should_output flag is False
    for fg_item in self.fg_items.values():
        if fg_item.item_name not in structure_items:
            continue
        if fg_item.should_output is False:
            logger.info(
                f"Delete structure item should not output: {fg_item.item_name}"
            )
            del structure_items[fg_item.item_name]

    return structure_items
def common_matched_rule(node1, node2, node_in_row1: pd.Series,
                        node_in_row2: pd.Series,
                        node_items: Dict[str, NodeItem], rows, fields: Dict):
    """
    Decide whether two rows match by checking that they "correspond" at
    enough positions.
    """
    row1_fid_set = set(node_in_row1.fid)
    row2_fid_set = set(node_in_row2.fid)

    # Rule: the two rows must correspond at two or more positions, i.e. at
    # least two positions where the text types correspond and the bboxes
    # are aligned.
    matched_fields_count = 0
    for fid in row1_fid_set | row2_fid_set:
        nodes_in_fields_1 = node_in_row1[node_in_row1.fid == fid]
        nodes_in_fields_2 = node_in_row2[node_in_row2.fid == fid]
        # Within each field, at least one pair of nodes must be aligned
        matched_text = []
        has_matched = False
        for comb in product(nodes_in_fields_1.iterrows(),
                            nodes_in_fields_2.iterrows()):
            node_in_f1 = node_items[comb[0][1].uid]
            node_in_f2 = node_items[comb[1][1].uid]
            avg_height = (node_in_f1.bbox.height + node_in_f2.bbox.height) / 2
            align_ratio = min(
                abs(node_in_f1.bbox.cx - node_in_f2.bbox.cx),
                abs(node_in_f1.bbox.left - node_in_f2.bbox.left),
                abs(node_in_f1.bbox.right - node_in_f2.bbox.right)) / avg_height
            if align_ratio < 0.1:
                has_matched = True
                matched_text = [node_in_f1.text, node_in_f2.text]
                break
        if has_matched:
            logger.info('row {} , {} matched in {}'.format(
                node1.text, node2.text, matched_text))
            matched_fields_count += 1

    logger.info(
        'matched_fields_count is {} for node {} and node {} , thresh is {}'.
        format(matched_fields_count, node1.text, node2.text,
               max(2, node1.num_fid_in_row - 1)))
    if matched_fields_count >= max(2, node1.num_fid_in_row - 1):
        return True
    return False
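# The alignment test above treats two boxes as column-aligned when the
# smallest of their center/left/right offsets is under 10% of their mean
# height. A self-contained numeric sketch with plain (left, right, height)
# tuples in place of the project's bbox type:
def aligned_sketch(box_a, box_b, ratio=0.1):
    left_a, right_a, h_a = box_a
    left_b, right_b, h_b = box_b
    cx_a, cx_b = (left_a + right_a) / 2, (left_b + right_b) / 2
    avg_height = (h_a + h_b) / 2
    offset = min(abs(cx_a - cx_b), abs(left_a - left_b), abs(right_a - right_b))
    return offset / avg_height < ratio


assert aligned_sketch((100, 180, 20), (101, 188, 22))      # lefts nearly match
assert not aligned_sketch((100, 180, 20), (160, 260, 22))  # shifted a column over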
def filter_redundant_content(self, node_info):
    useless_row_id = []
    for filter_config in self.filterrow_config['filter_content']:
        # Walk every filter configuration
        regex_list = filter_config['regex']
        adaptive_fields = filter_config['adaptive_fields']
        for rid, rid_content in self.row_content_in_each_fields.copy().items():
            # Walk every row
            for fid, content_info in rid_content.items():
                # Walk the fields this row touches
                header_type = content_info['header_type']
                if header_type not in adaptive_fields:
                    continue
                row_content_in_field = content_info['content']
                useless = False
                for regex in regex_list:
                    if re.match(regex, row_content_in_field):
                        useless = True
                        break
                if useless:
                    del self.row_content_in_each_fields[rid]
                    row_order = content_info['row_order']
                    logger.info('{} is not useful'.format(row_content_in_field))
                    useless_row_id.append(row_order)
                    break

    for rid, rid_content in self.rows.copy().items():
        row_content = rid_content.content()
        need_ignore = False
        for regex in self.filterrow_config['filter_content_in_line']:
            if re.search(regex, row_content, re.IGNORECASE):
                need_ignore = True
        if need_ignore:
            # pop() guards against rows already removed in the pass above
            self.row_content_in_each_fields.pop(rid, None)
            useless_row_id.append(rid_content.order)

    useless_row_id = set(useless_row_id)
    if not useless_row_id:
        return node_info
    node_info = node_info[~node_info.row_order.isin(useless_row_id)]
    # TODO: adaptively strip trailing (table-footer) content
    return node_info
def h_merge_match(self, node_items: Dict[str, NodeItem]) -> Tuple[List[NodeItem], List[int]]:
    norm_match_res = self.norm_match(node_items)
    if len(norm_match_res[0]) != 0:
        return norm_match_res

    candidate_node_items = {}
    candidate_chars_count = 0
    for node_item in node_items.values():
        if node_item.cn_text:
            for ic, c in enumerate(node_item.text):
                if c in self.text:
                    s = node_item.split(ic, ic + 1)
                    candidate_node_items[s.uid] = s
                    candidate_chars_count += 1

    # If the candidate nodes hold fewer characters than the background
    # content, there is nothing to merge; return early.
    if candidate_chars_count < len(self.text):
        return [], []

    line_groups = NodeItemGroup.find_row_lines(candidate_node_items, y_thresh=0.3)

    out = []
    ed_dists = []
    for group in line_groups:
        if len(group.content()) < len(self.text):
            continue

        _g = group
        # Drop characters whose horizontal gap exceeds twice the average gap
        if len(group.node_items) >= 3:
            avg_space = 0
            for i in range(len(group.node_items) - 1):
                avg_space += (group.node_items[i + 1].bbox.cx -
                              group.node_items[i].bbox.cx)
            # n node_items have n - 1 gaps between centers
            avg_space /= len(group.node_items) - 1

            __g = NodeItemGroup([group.node_items[0], group.node_items[1]])
            for i in range(2, len(group.node_items)):
                if (group.node_items[i].bbox.cx -
                        group.node_items[i - 1].bbox.cx) > 2 * avg_space:
                    continue
                __g.append(group.node_items[i])
            if len(__g.node_items) != 0:
                _g = __g

        if _g.content() == self.text:
            new_node = NodeItem(_g.gen_raw_node())
            out.append(new_node)
            ed_dists.append(editdistance.eval(new_node.text, self.text))
            logger.info(f"bg_item [{self}] match node_item {new_node} by [h_merge_match]")

    return out, ed_dists
def group_into_rows(self, node_items):
    logger.info('angle of header is {}'.format(self.angle_of_header))
    self.require_angle_from_node_items(node_items)
    if not self.cfg.LINE_HANDLER.consider_angle:
        # Angle handling disabled: use the simple row-grouping flow
        rows = ParagraphHandler.group_into_rows(node_items)
    else:
        # Angle-aware grouping for skewed headers
        rows = ParagraphHandler.group_into_lines(
            node_items, self.angle_of_header,
            self.cfg.LINE_HANDLER.angle_merge_thresh)
    return rows
def setup_grpc_server(port):
    cfg = MyConfig()
    grpc_server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=1),
        options=[
            ('grpc.max_receive_message_length', cfg.grpc_max_message_length.value),
            ('grpc.max_send_message_length', cfg.grpc_max_message_length.value)
        ],
        maximum_concurrent_rpcs=cfg.grpc_max_concurrent.value,
    )
    add_StructuringServicer_to_server(servicer=StructuringServer(), server=grpc_server)
    add_MetricsServicer_to_server(servicer=MetricsServer(cfg), server=grpc_server)
    add_HealthServicer_to_server(servicer=HealthServer(), server=grpc_server)
    grpc_server.add_insecure_port('[::]:%s' % port)
    grpc_server.start()
    logger.info('grpc server starts serving at %s' % port)
    return grpc_server
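# Client-side sketch for the server above. gRPC codegen for a service named
# `Structuring` conventionally produces a `StructuringStub` in the generated
# *_pb2_grpc module; the RPC names and request messages are not shown here,
# so this only builds the channel and stub.
def make_structuring_stub(addr='localhost:50051'):
    channel = grpc.insecure_channel(
        addr,
        options=[('grpc.max_send_message_length', 100 * 1024 * 1024),
                 ('grpc.max_receive_message_length', 100 * 1024 * 1024)])
    return StructuringStub(channel)  # assumed generated stub class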
def extract_info(self, block, header_config: ContentConfig):
    block_content = '\n'.join(block.content)
    extract_result = extract_text_from_multiline_text(
        block_content,
        start_key_words=header_config.start_key_words,
        end_key_words=header_config.end_key_words,
        start_exps=header_config.start_exps,
        end_exps=header_config.end_exps,
        start_filter_exps=header_config.start_filter_exps,
        filter_exps=header_config.filter_exps)
    if not extract_result[0]:
        # If a misconfigured filter removed everything, fall back to the
        # original block content.
        logger.info('block content {} is filtered to empty!'.format(block_content))
        if extract_result[1]:
            return True, block_content
        return False, ''
    return True, extract_result[0]
def parse_header_requirement(self, fields):
    """
    :param fields: information about each column
    :return: whether the columns satisfy this header requirement, and the
             matching field ids within ``fields``
    """
    fields_in_type_req = {
        fid: field
        for fid, field in fields.items()
        if field.header.head_type == self.header_type
    }
    if len(fields_in_type_req) == 0:
        return False, set()

    if self.header_regexs is None:
        logger.info('check header {} by {}'.format(
            [f.header.key_node.content for f in fields_in_type_req.values()],
            self.header_type))
        return True, set(fields_in_type_req.keys())

    regex_check = {
        fid: field
        for fid, field in fields_in_type_req.items()
        if any([
            re.match(regex, field.header.key_node.content, re.IGNORECASE)
            for regex in self.header_regexs
        ])
    }
    if len(regex_check) > 0:
        logger.info('check header {} by {}'.format(
            [f.header.key_node.content for f in regex_check.values()],
            self.header_regexs))
        return True, set(regex_check.keys())
    return False, set()
def upload_to_server(config):
    result_dir_path = os.path.join(config.work_dir, config.result_dirname)
    if not os.path.exists(result_dir_path):
        logger.info('experiment result does not exist, abort upload')
        return

    filenames = os.listdir(result_dir_path)
    if len(filenames) == 0:
        return

    data = {
        'items': {},
        'commonVariables': debugger.commonVariables,
    }
    for filename in filenames:
        item_name, _ = os.path.splitext(filename)
        with open(os.path.join(result_dir_path, filename), encoding='utf-8') as f:
            json_data = json.load(f)
        data['items'][item_name] = json_data

    # result_json = json.dumps(data, ensure_ascii=False, default=lambda x: x.__dict__)
    # logger.debug(f'experiment result: {result_json}')
    StClient(config.debug_server_addr).upload_experiment_result(config.exp_id, data)
def select_useful_headers(self, header_groups):
    # Returns a list for now, in case multiple header rows are wanted later.
    # Deduplicate header_groups by their final bbox and content.
    bbox_header_group_map = {}
    for header_group in header_groups:
        bbox_info = '_'.join([str(_) for _ in header_group.bbox.rect])
        content_info = '_'.join([_.key_node.content for _ in header_group.finded_header])
        bbox_header_group_map.update({bbox_info + '_' + content_info: header_group})

    for header_group in bbox_header_group_map.values():
        content_info = '_|_'.join([_.key_node.content for _ in header_group.finded_header])
        logger.info('find possible header with content {}'.format(content_info))

    filtered_group: List[HeaderGroup] = list(bbox_header_group_map.values())

    # TODO: return multiple headers if needed
    # clean_group = self.remove_overlap(filtered_group)
    # return clean_group

    # Selection rule 1: prefer groups that keep as much content as possible
    filtered_group = sorted(filtered_group, key=lambda x: x.evaluation_score, reverse=True)
    filtered_group = [header_group for header_group in filtered_group
                      if header_group.evaluation_score == filtered_group[0].evaluation_score]
    if len(filtered_group) == 1:
        return [filtered_group[0]]
    else:
        # If the tied groups sit at roughly the same height, pick the widest
        filtered_group = sorted(filtered_group, key=lambda x: x.bbox.width, reverse=True)
        if abs(filtered_group[0].bbox.cy - np.mean(
                [header_group.bbox.cy for header_group in filtered_group[1:]])) < 10:
            return [filtered_group[0]]
        else:
            # Otherwise pick the one with the smallest angle variation
            filtered_group = sorted(filtered_group, key=lambda x: x.angle_score)
            return [filtered_group[0]]
def build_blocks(self, row_id, rows, row_order_id_map, node_info, auto_remove_tail=False):
    """
    :param row_id: row number of the key row
    :param rows: list of the row orders that belong to this record
    :param row_order_id_map: mapping from row_order to row id
    :param node_info:
    :param auto_remove_tail: if True, apply extra filtering rules to the last
        record to strip table-footer content
    :return:
    """
    lines_in_field = defaultdict(list)
    useful_row = [True] * len(rows)
    if auto_remove_tail and len(rows) >= 2:
        # Adaptively drop unneeded trailing rows.
        # Rule 1: measure the gaps between rows; once a very large gap
        # appears, ignore everything after it.
        row_bottom = [
            self.rows[row_order_id_map[rid]].bbox.bottom for rid in rows
        ]
        row_height = [
            self.rows[row_order_id_map[rid]].bbox.height for rid in rows
        ]
        row_height_diff = np.diff(row_bottom) > 5 * np.mean(row_height)
        after_useless = False
        for idx in range(1, len(rows)):
            if row_height_diff[idx - 1]:
                after_useless = True
            if after_useless:
                useful_row[idx] = False

    for row, is_useful in zip(rows, useful_row):
        if not is_useful:
            continue
        row_info = self.row_content_in_each_fields[row_order_id_map[row]]
        for fid, field_info in row_info.items():
            lines_in_field[fid].append({
                'line_item': field_info['element_group'],
                'line_content': field_info['content']
            })

    row_info = {}
    for fid, field_info in lines_in_field.items():
        line_content = [line['line_content'] for line in field_info]
        line_item = [line['line_item'] for line in field_info]
        header_name = self.fields[fid].header.name
        header_type = self.fields[fid].header.head_type
        update = False
        if header_type in [
                self.header_type[htype]
                for htype in self.cfg.ELEMENT_HANDLER.get('block_update_config', [])
        ]:
            logger.info('set update True for {}'.format(header_type))
            update = True
        row_info[fid] = Block(fid, row_id, header_name, header_type,
                              line_content, line_item, update=update)
    return row_info
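# Self-contained sketch of the gap rule above: with row bottoms and heights
# as plain lists, any inter-row gap larger than 5x the mean row height marks
# the start of the discarded tail. The numbers are made up for illustration.
import numpy as np

row_bottom = [100, 130, 160, 420, 450]   # the 160 -> 420 jump is the big gap
row_height = [20, 20, 20, 20, 20]
big_gap = np.diff(row_bottom) > 5 * np.mean(row_height)
# big_gap == [False, False, True, False]; rows after index 2 get dropped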
def filter_redundant_line(self, start_filter_line, node_info, possible_key_row=None):
    # Filter data at the row level
    ignore_bg_lines = []
    for idx, (bg_texts, ed_thresh) in enumerate(self.filterrow_config['filter_lines']):
        bg_texts = re.sub('[^0-9A-Za-z]', '', bg_texts).lower()
        self.filterrow_config['filter_lines'][idx] = (bg_texts, ed_thresh)

    # Map row_order -> rid
    row_order_id_map = {
        self.rows[rid].order: rid
        for rid in self.row_content_in_each_fields
    }
    # Sort by row order, top to bottom
    row_order_id_map = OrderedDict(sorted(row_order_id_map.items(), key=lambda x: x[0]))

    after_filter_row = False  # everything after a filtered row is dropped too
    for order, rid in row_order_id_map.items():
        # Walk every row
        if after_filter_row:
            ignore_bg_lines.append(order)
            del self.row_content_in_each_fields[rid]
            continue
        row = self.rows[rid]
        if row.order < start_filter_line:
            continue
        row_content = row.content()
        row_content = re.sub('[^0-9A-Za-z]', '', row_content).lower()
        logger.info('checking whether row needs filtering: {}'.format(row_content))
        filtered_by_line_rule = False
        for bg_texts, ed_thresh in self.filterrow_config['filter_lines']:
            dist = ed.eval(row_content, bg_texts)
            if dist < ed_thresh:
                del self.row_content_in_each_fields[rid]
                ignore_bg_lines.append(row.order)
                after_filter_row = True
                filtered_by_line_rule = True
                break

        if filtered_by_line_rule:
            # Already marked as a row to filter; skip the combination rules
            continue

        for comb in self.filterrow_config['filter_comb']:
            # Check each combination rule
            matched_count = 0
            for header_type_list, regex_config in comb:
                if isinstance(header_type_list, self.header_group.header_types):
                    header_type_list = [header_type_list]
                at_least_succeed = False
                for header_type in header_type_list:
                    # Walk every header_type covered by this rule
                    if at_least_succeed:
                        break
                    if isinstance(regex_config, list):
                        # The rule for this content is a plain regex list
                        regex_list = regex_config
                        # Collect this row's content for columns of this type
                        content = [
                            fid_info['content']
                            for fid, fid_info in self.row_content_in_each_fields[rid].items()
                            if fid_info['header_type'] == header_type
                        ]
                        for regex in regex_list:
                            if any([
                                    re.search(regex, text, re.IGNORECASE) is not None
                                    for text in content
                            ]):
                                matched_count += 1
                                break
                    elif isinstance(regex_config, dict):
                        regex_list = regex_config['content_regex']
                        header_regex_list = regex_config['header_regex']
                        content_list = [
                            (fid, fid_info['content'])
                            for fid, fid_info in self.row_content_in_each_fields[rid].items()
                            if fid_info['header_type'] == header_type
                        ]
                        # For each fid, fetch the header content of its column
                        content_list = [(self.fields[fid].header.key_node.content, fid_content)
                                        for fid, fid_content in content_list]
                        # Keep only content whose header matches header_regex_list
                        content_satisfy_header_regex = []
                        for header_content, field_content in content_list:
                            satisfy_regex = False
                            for header_regex in header_regex_list:
                                if re.search(header_regex, header_content, re.IGNORECASE):
                                    satisfy_regex = True
                                    break
                            if satisfy_regex:
                                content_satisfy_header_regex.append(field_content)
                        if len(content_satisfy_header_regex) == 0:
                            # No column in this row satisfies header_regex
                            continue
                        for regex in regex_list:
                            if any([
                                    re.search(regex, text, re.IGNORECASE) is not None
                                    for text in content_satisfy_header_regex
                            ]):
                                matched_count += 1
                                at_least_succeed = True
                                break

            if matched_count == len(comb):
                logger.info('filtered {} by filter_comb'.format(self.rows[rid].content()))
                del self.row_content_in_each_fields[rid]
                ignore_bg_lines.append(row.order)
                after_filter_row = True
                break

    node_info = node_info[~node_info.row_order.isin(ignore_bg_lines)]
    if possible_key_row is not None:
        possible_key_row = possible_key_row - set(ignore_bg_lines)
    return node_info, possible_key_row
def filter_nodes_below_headers(self, node_items: Dict[str, NodeItem]):
    # Return the node_items that lie in the region below the header.
    # If rbox (rotated box) information exists, more precise filtering is possible.
    has_rbox = False
    for _, value in node_items.items():
        if getattr(value, 'rbox', None):
            has_rbox = True
            break

    if has_rbox:
        # Find the lowest header node
        nodes = self.get_all_nodes()
        lowest_head_node = sorted(nodes, key=lambda x: x.rbox.cy, reverse=True)[0]
        # Estimate the dominant angle from long texts only; short texts give
        # noisy angle measurements.
        node_item_list = [node for node in node_items.values() if len(node.text) > 4]
        if len(node_item_list) > 0:
            meaningful_angle_list = np.array(
                [node.rbox.meaningful_angle for node in node_item_list])
            median_angle = np.median(meaningful_angle_list)
            angle_mark_node = node_item_list[
                np.argmin(np.abs(meaningful_angle_list - median_angle))].rbox
            header_line = line_utils.gen_parallel_line(
                angle_mark_node.up_left[0], angle_mark_node.up_left[1],
                angle_mark_node.up_right[0], angle_mark_node.up_right[1],
                lowest_head_node.rbox.down_left[0],
                lowest_head_node.rbox.down_left[1])
            filtered_nodes = dict()
            for node in node_items.values():
                if header_line.is_under(line_utils.Point(node.rbox.cx, node.rbox.cy)):
                    filtered_nodes[node.uid] = node
            return filtered_nodes

    mean_interval = self.mean_header_interval
    mean_width = self.mean_header_width
    xmin_limit = self.bbox.left - (mean_interval + mean_width)
    xmax_limit = self.bbox.right + (mean_interval + mean_width)
    head_nodes = [set(node.uid for node in header.key_node.node_items)
                  for header in self.finded_header]
    head_nodes = set.union(*head_nodes)
    min_bottom = min([header.key_node.bbox.bottom for header in self.finded_header])
    ymin_limit = min_bottom - 0.2 * np.mean(
        [header.key_node.avg_height for header in self.finded_header])
    filtered_nodes = dict()
    for uid, node in node_items.items():
        if uid in head_nodes:
            continue
        if node.bbox.top <= ymin_limit:
            continue
        if node.bbox.left <= xmin_limit:
            continue
        if node.bbox.right >= xmax_limit:
            continue
        filtered_nodes[uid] = node
    logger.info('filtered {} nodes above the header'.format(
        len(node_items) - len(filtered_nodes)))
    return filtered_nodes
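# `line_utils` is project-internal. A plausible reading of the two calls
# above, sketched with plain math: gen_parallel_line builds the line through
# (px, py) parallel to the segment (x1, y1)-(x2, y2), and is_under checks
# whether a point lies below that line (image y grows downward). The real
# utility may differ in details such as vertical-line handling.
from collections import namedtuple

Point = namedtuple('Point', ['x', 'y'])


class ParallelLine:
    def __init__(self, x1, y1, x2, y2, px, py):
        # Slope of the reference segment; degenerate vertical case flattened
        self.slope = (y2 - y1) / (x2 - x1) if x2 != x1 else 0.0
        self.px, self.py = px, py

    def y_at(self, x):
        return self.py + self.slope * (x - self.px)

    def is_under(self, pt: Point) -> bool:
        return pt.y > self.y_at(pt.x)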
def prepare_data(config, with_cache=True):
    if with_cache:
        if _rec_cache_available(config):
            logger.info('recognition result data and image exist, using cache')
            return
        logger.info('recognition result data and image not found, fetching from server')

    st_client = StClient(config.debug_server_addr)
    raws = st_client.fetch_raw_data_list(config.lab_id, 1, 10000)

    rec_data_dir = os.path.join(config.work_dir, config.recognition_data_dirname)
    rec_img_dir = os.path.join(config.work_dir, config.recognition_img_dirname)
    img_pool_dir = os.path.join(os.path.dirname(config.work_dir),
                                "ocr_structuring_img_pool")
    os.makedirs(img_pool_dir, exist_ok=True)
    os.makedirs(rec_data_dir, exist_ok=True)
    os.makedirs(rec_img_dir, exist_ok=True)

    def get_data(raw):
        if not os.path.exists(os.path.join(rec_data_dir, str(raw['id'])) + '.json'):
            with open(os.path.join(rec_data_dir, str(raw['id'])) + '.json',
                      'w', encoding='utf-8') as f:
                json.dump(raw['data'], f, ensure_ascii=False, indent=2)

        if config.use_img:
            assert raw['media_id'] is not None
            if not os.path.exists(os.path.join(img_pool_dir, str(raw['media_id']) + '.jpg')):
                st_client.download_media(
                    raw['media_id'],
                    os.path.join(img_pool_dir, str(raw['media_id']) + '.jpg'))
            if not os.path.exists(os.path.join(rec_img_dir, str(raw['id']) + '.jpg')):
                os.symlink(
                    os.path.join(img_pool_dir, str(raw['media_id']) + '.jpg'),
                    os.path.join(rec_img_dir, str(raw['id']) + '.jpg'))

    def get_data_by_gt_id(raw):
        # Alternative download path that names files by gt_id (unused below)
        with open(os.path.join(rec_data_dir, str(raw['gt_id'])) + '.json',
                  'w', encoding='utf-8') as f:
            json.dump(raw['data'], f, ensure_ascii=False, indent=2)
        if config.use_img:
            assert raw['media_id'] is not None
            st_client.download_media(
                raw['media_id'],
                os.path.join(rec_img_dir, str(raw['gt_id']) + '.jpg'))

    with ThreadPoolExecutor(3) as executor:
        for raw in raws['items']:
            executor.submit(get_data, raw)
def h_split_match(
    self,
    node_items: Dict[str, NodeItem],
    *,
    sub_seq_max_interval: int = 2,
    sub_seq_pre_func: Callable = None,
) -> Tuple[List[NodeItem], List[int], List[NodeItem]]:
    """
    :param node_items:
    :param sub_seq_max_interval: maximum gap allowed inside the longest
        common subsequence
    :param sub_seq_pre_func: preprocessing applied to it.text before calling
        max_sub_seq_order_dp
    :return:
    """
    splited_ed_dist = []
    splited_key_node = []
    splited_rest_nodes = []
    for it in node_items.values():
        if sub_seq_pre_func is None:
            res, bg_idxes, node_idxes = max_sub_seq_order_dp(self.text, it.text)
        else:
            res, bg_idxes, node_idxes = max_sub_seq_order_dp(
                self.text, sub_seq_pre_func(it.text))

        ed_dist = abs(len(res) - len(self.text))

        if self.ed_thresh == -1:
            if res != self.text:
                continue
        else:
            if ed_dist > self.ed_thresh:
                continue

        # Gaps between matched node indices must not exceed sub_seq_max_interval
        should_continue = False
        for i in range(len(node_idxes) - 1):
            if node_idxes[i + 1] - node_idxes[i] > sub_seq_max_interval:
                should_continue = True
                break
        if should_continue:
            continue

        if node_idxes[0] > 2:
            # The split-off node should sit near the start of the string
            continue

        if sub_seq_pre_func is not None:
            node_idxes = self.align_node_idxes(node_idxes,
                                               sub_seq_pre_func(it.text),
                                               it.text)

        start_idx = node_idxes[0]
        end_idx = node_idxes[-1] + 1

        if end_idx > start_idx:
            new_node = it.split(start_idx, end_idx)
            if new_node:
                splited_key_node.append(new_node)
                splited_ed_dist.append(ed_dist)

                rest_node = it.split(end_idx, -1)
                if rest_node:
                    splited_rest_nodes.append(rest_node)

    norm_match_res, norm_match_ed_dists = self.norm_match(node_items)
    splited_key_node.extend(norm_match_res)
    splited_ed_dist.extend(norm_match_ed_dists)

    if len(splited_key_node) != 0:
        logger.info(f"bg_item [{self}] match {' '.join(map(str, splited_key_node))} by [split_match]")

    for node in splited_rest_nodes:
        node.is_cut = True
    return splited_key_node, splited_ed_dist, splited_rest_nodes
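# `max_sub_seq_order_dp` is not shown in this section; judging by its use, it
# returns the longest common subsequence of the background text and the node
# text, plus the matched indices in each. A minimal LCS sketch with index
# recovery (the real helper may differ, e.g. in how it breaks ties):
def lcs_with_indices(a: str, b: str):
    n, m = len(a), len(b)
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n):
        for j in range(m):
            dp[i + 1][j + 1] = (dp[i][j] + 1 if a[i] == b[j]
                                else max(dp[i][j + 1], dp[i + 1][j]))
    # Walk back through the table to recover the matched positions
    i, j, a_idxes, b_idxes = n, m, [], []
    while i > 0 and j > 0:
        if a[i - 1] == b[j - 1]:
            a_idxes.append(i - 1)
            b_idxes.append(j - 1)
            i, j = i - 1, j - 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    a_idxes.reverse()
    b_idxes.reverse()
    return ''.join(a[k] for k in a_idxes), a_idxes, b_idxes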
if __name__ == '__main__':
    config = MyConfig()
    multiprocessing.freeze_support()
    server = setup_grpc_server(config.grpc_port.value)
    validator = TimeValidator(datetime(year=2019, month=9, day=24),
                              datetime(year=2029, month=9, day=25))
    try:
        while validator.validate():
            time.sleep(60)  # 1 minute
        server.stop(0)
        while True:
            time.sleep(60 * 60 * 24)  # 1 day
    except KeyboardInterrupt:
        server.stop(0)
        logger.info('grpc server stop serving')
def recheck_rows_v2(self, node_info, row_map, node_items, thresh=2):
    # This does not solve every case; it only targets the 品-shaped
    # ("staircase") layout: scan every three adjacent rows, and if a node in
    # the first row and one in the third row sit tightly stacked while the
    # second row has a node whose height is close to the midpoint between
    # them, merge the three rows.
    if len(row_map) <= 2:
        return node_info, row_map
    # Sort row_map top to bottom
    ordered_row_map = sorted(row_map.items(), key=lambda x: x[1].bbox.top)

    new_row_group = []
    iter_mask = [False] * len(ordered_row_map)  # rows already consumed
    for i in range(0, len(ordered_row_map)):
        if iter_mask[i]:
            continue
        if i in [len(ordered_row_map) - 1, len(ordered_row_map) - 2]:
            # Fewer than three rows left: keep them as they are
            iter_mask[i] = True
            new_row_group.append(ordered_row_map[i][1].node_items)
            continue
        # Take a window of three rows
        up_row, middle_row, down_row = ordered_row_map[i:i + 3]
        # The comparison runs top to bottom
        up_nodes = sorted(up_row[1].node_items, key=lambda x: x.bbox.left)
        down_nodes = sorted(down_row[1].node_items, key=lambda x: x.bbox.left)
        middle_nodes = sorted(middle_row[1].node_items, key=lambda x: x.bbox.left)
        benchmark_pair = None
        for up_node in up_nodes:
            find_pair = False
            for down_node in down_nodes:
                left_to_right = down_node.bbox.left - up_node.bbox.right
                if left_to_right > 0:
                    break
                left_align = abs(up_node.bbox.left - down_node.bbox.left)
                right_align = abs(up_node.bbox.right - down_node.bbox.right)
                middle_align = abs(up_node.bbox.cx - down_node.bbox.cx)
                if min(left_align, right_align, middle_align) > np.mean(
                        [up_node.bbox.height, down_node.bbox.height]):
                    continue
                if abs(down_node.bbox.top - up_node.bbox.bottom) > 0.5 * np.mean(
                        [up_node.bbox.height, down_node.bbox.height]):
                    continue
                find_pair = True
                benchmark_pair = (up_node, down_node)
                logger.info("find 品 shape pair {} , {}".format(
                    up_node.text, down_node.text))
                break
            if find_pair:
                break

        if benchmark_pair is None:
            new_row_group.append(up_nodes)
            iter_mask[i] = True
            continue
        # Compute the vertical midpoint between the stacked pair
        offset = abs(benchmark_pair[0].bbox.bottom - benchmark_pair[1].bbox.top) / 2
        center_y = min(benchmark_pair[0].bbox.bottom,
                       benchmark_pair[1].bbox.top) + offset
        should_group = False
        for node in middle_nodes:
            if abs(node.bbox.cy - center_y) < node.bbox.height * 0.5:
                should_group = True
                break
        if should_group:
            new_group = up_nodes + middle_nodes + down_nodes
            new_row_group.append(new_group)
            iter_mask[i:i + 3] = True, True, True
        else:
            new_row_group.append(up_nodes)
            iter_mask[i] = True

    new_rows = [Line(row) for row in new_row_group]
    node_info, row_map = self.make_node_info(new_rows)
    return node_info, row_map
def find_valid_keyrow(self, node_info, fields, rows, header_group):
    # Find valid key rows once header_group selection is done.
    # First pick out the columns that this rule's regexes apply to.
    used_fid_set = []
    for field in self.adaptive_fields:
        satisfy_status, fid_set = field.parse_header_requirement(fields)
        if not satisfy_status:
            continue
        used_fid_set.append(fid_set)

    # Collect the valid fids
    if not used_fid_set:
        return False, {}
    selected_fid = set.union(*used_fid_set)
    if len(selected_fid) == 0:
        # No column in this document satisfies the rule's header requirements
        return False, {}

    filtered = node_info
    # Disabled: drop rows whose column count is insufficient
    # filtered = node_info[node_info.num_fid_in_row >= len(self.adaptive_fields)]
    # if self.check_empty(filtered):
    #     return False, {}

    # Keep only the columns this rule cares about
    filtered = filtered[filtered.fid.isin(selected_fid)]
    if self.check_empty(filtered):
        return False, {}

    # Check that the characters and character types meet the requirements
    filtered = filtered[filtered.apply(self.map_func, axis=1)]
    if self.check_empty(filtered):
        return False, {}

    key_row = set(filtered.row_order.unique())
    for _, data in filtered.groupby('row_order'):
        content = '--'.join(data.text.to_list())
        logger.info('check row {} by {}'.format(content, self.regexs))

    # Filter key_row with the configured unexpected_content rules
    after_filter_key_row = []
    for krow in key_row:
        node_in_this_row = node_info[node_info.row_order == krow]
        matched_unexpected = False
        for regexes in self.unexpected_content:
            matched_all = len(regexes) > 0
            for regex in regexes:
                matched_filter_rule = node_in_this_row[
                    node_in_this_row.text.map(
                        lambda x: re.search(regex, x, re.IGNORECASE) is not None)]
                if matched_filter_rule.shape[0] == 0:
                    # No element in this row matches the regex
                    matched_all = False
            if matched_all:
                matched_unexpected = True
        if not matched_unexpected:
            after_filter_key_row.append(krow)
    key_row = set(after_filter_key_row)
    return True, key_row
opts = parser.parse_args()

debugger.enabled = True

config = ProcessConfig(
    class_name=opts.class_name,
    primary_class=opts.primary_class,
    secondary_class=opts.secondary_class,
    use_img=opts.use_img,
    preload_tpl=opts.preload_tpl,
    process_count=opts.process_count,
    debug_server_addr=opts.debug_server_addr,
    lab_id=opts.lab_id,
    exp_id=opts.exp_id,
    raw_data_id=opts.raw_data_id,
    work_dir=opts.work_dir,
)

processor = Processor(config)

if config.is_single_debug:
    # Process a single item
    processor.process_single(str(config.raw_data_id))
else:
    logger.info('preparing debug data')
    prepare_data(config)
    logger.info('processing')
    processor.process()
    if config.debug_server_addr:
        logger.info('uploading experiment result...')
        upload_to_server(config)
        logger.info('upload experiment result success.')
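# Example invocation sketch. The parser definition is not shown, so the flag
# spellings are assumed to mirror the ProcessConfig field names above, and
# the script name is hypothetical:
#
#   python run_structuring_debug.py \
#       --class_name medical_invoice \
#       --lab_id 42 --exp_id 7 \
#       --use_img --preload_tpl \
#       --work_dir ./work --debug_server_addr http://localhost:8000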