def iter_model_data_for_layout_document(  # pylint: disable=too-many-locals
    self, layout_document: LayoutDocument
) -> Iterable[LayoutModelData]:
    relative_font_size_feature = RelativeFontSizeFeature(
        layout_document.iter_all_tokens()
    )
    line_indentation_status_feature = LineIndentationStatusFeature()
    previous_layout_token: Optional[LayoutToken] = None
    concatenated_line_tokens_length_by_line_id = {
        id(line): sum(len(token.text) for token in line.tokens)
        for block in layout_document.iter_all_blocks()
        for line in block.lines
    }
    if not concatenated_line_tokens_length_by_line_id:
        LOGGER.debug('empty layout document')
        return
    max_concatenated_line_tokens_length = max(
        concatenated_line_tokens_length_by_line_id.values()
    )
    document_token_count = sum(1 for _ in layout_document.iter_all_tokens())
    document_token_index = 0
    for block in layout_document.iter_all_blocks():
        block_lines = block.lines
        line_count = len(block_lines)
        for line_index, line in enumerate(block_lines):
            line_indentation_status_feature.on_new_line()
            line_tokens = line.tokens
            token_count = len(line_tokens)
            concatenated_line_tokens_text = ''.join(
                token.text for token in line_tokens
            )
            line_token_position = 0
            for token_index, token in enumerate(line_tokens):
                yield from self.iter_model_data_for_context_layout_token_features(
                    ContextAwareLayoutTokenFeatures(
                        token,
                        layout_line=line,
                        previous_layout_token=previous_layout_token,
                        document_features_context=self.document_features_context,
                        token_index=token_index,
                        token_count=token_count,
                        document_token_index=document_token_index,
                        document_token_count=document_token_count,
                        line_index=line_index,
                        line_count=line_count,
                        concatenated_line_tokens_text=concatenated_line_tokens_text,
                        max_concatenated_line_tokens_length=max_concatenated_line_tokens_length,
                        line_token_position=line_token_position,
                        relative_font_size_feature=relative_font_size_feature,
                        line_indentation_status_feature=line_indentation_status_feature
                    )
                )
                previous_layout_token = token
                line_token_position += len(token.text)
                document_token_index += 1
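# --- Illustrative sketch (not part of the class above) ---
# The method above threads running counters (document_token_index,
# line_token_position, document_token_count, line/token counts) through every
# token so that downstream features can describe each token's position within
# its line and within the whole document. The standalone function below is a
# simplified, assumption-based sketch of that bookkeeping using plain lists of
# token strings instead of LayoutDocument/LayoutToken objects; the division
# into relative positions is only for illustration.

from typing import Iterable, List, Tuple


def iter_relative_token_positions(
    lines: List[List[str]]
) -> Iterable[Tuple[str, float, float]]:
    """Yield (token_text, relative_document_position, relative_line_position)."""
    document_token_count = sum(len(line_tokens) for line_tokens in lines)
    document_token_index = 0
    for line_tokens in lines:
        concatenated_line_tokens_text = ''.join(line_tokens)
        line_token_position = 0
        for token_text in line_tokens:
            yield (
                token_text,
                document_token_index / max(1, document_token_count),
                line_token_position / max(1, len(concatenated_line_tokens_text))
            )
            line_token_position += len(token_text)
            document_token_index += 1


# Example: "Hello World" on one line, "Foo" on the next
for example in iter_relative_token_positions([['Hello', 'World'], ['Foo']]):
    print(example)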
def iter_line_features(  # pylint: disable=too-many-locals
    self, layout_document: LayoutDocument
) -> Iterable[SegmentationLineFeatures]:
    segmentation_line_features = SegmentationLineFeatures(
        document_features_context=self.document_features_context
    )
    previous_token: Optional[LayoutToken] = None
    segmentation_line_features.document_token_count = sum(
        len(line.tokens)
        for block in layout_document.iter_all_blocks()
        for line in block.lines
    )
    # only the first two and the last block of each page are candidates
    # for repetitive (header / footer) patterns
    pattern_candidate_block_iterable = (
        block
        for page in layout_document.pages
        for block_index, block in enumerate(page.blocks)
        if block_index < 2 or block_index > len(page.blocks) - 2
    )
    pattern_candidate_line_iterable = (
        block.lines[0]
        for block in pattern_candidate_block_iterable
        if block.lines and block.lines[0].tokens
    )
    all_pattern_by_line_id = {
        id(line): get_text_pattern(line.text)
        for line in pattern_candidate_line_iterable
    }
    LOGGER.debug('all_pattern_by_line_id: %s', all_pattern_by_line_id)
    pattern_by_line_id = {
        key: value
        for key, value in all_pattern_by_line_id.items()
        if len(value) >= 8  # Java GROBID sometimes counts an additional trailing space
    }
    # a pattern occurring on more than one line is treated as repetitive
    pattern_counter = Counter(pattern_by_line_id.values())
    LOGGER.debug('pattern_counter: %s', pattern_counter)
    seen_repetitive_patterns: Set[str] = set()
    document_token_index = 0
    for page in layout_document.pages:
        blocks = page.blocks
        segmentation_line_features.page_blocks = blocks
        for block_index, block in enumerate(blocks):
            segmentation_line_features.page_block_index = block_index
            block_lines = block.lines
            segmentation_line_features.block_lines = block_lines
            block_line_texts = [line.text for line in block_lines]
            max_block_line_text_length = max(
                len(text) for text in block_line_texts
            )
            first_block_token = next(iter(block.iter_all_tokens()), None)
            assert first_block_token
            for line_index, line in enumerate(block_lines):
                segmentation_line_features.document_token_index = document_token_index
                document_token_index += len(line.tokens)
                segmentation_line_features.layout_line = line
                segmentation_line_features.block_line_index = line_index
                segmentation_line_features.max_block_line_text_length = (
                    max_block_line_text_length
                )
                line_text = block_line_texts[line_index]
                retokenized_token_texts = re.split(r" |\t|\f|\u00A0", line_text)
                if not retokenized_token_texts:
                    continue
                if self.use_first_token_of_block:
                    # Java GROBID uses the first token in the block
                    token = first_block_token
                else:
                    token = line.tokens[0]
                segmentation_line_features.layout_token = token
                segmentation_line_features.line_text = line_text
                segmentation_line_features.concatenated_line_tokens_text = line_text
                segmentation_line_features.token_text = retokenized_token_texts[0].strip()
                segmentation_line_features.second_token_text = (
                    retokenized_token_texts[1]
                    if len(retokenized_token_texts) >= 2
                    else ''
                )
                segmentation_line_features.previous_layout_token = previous_token
                line_pattern = pattern_by_line_id.get(id(line), '')
                LOGGER.debug('line_pattern: %r', line_pattern)
                segmentation_line_features.is_repetitive_pattern = (
                    pattern_counter[line_pattern] > 1
                )
                segmentation_line_features.is_first_repetitive_pattern = (
                    segmentation_line_features.is_repetitive_pattern
                    and line_pattern not in seen_repetitive_patterns
                )
                if segmentation_line_features.is_first_repetitive_pattern:
                    seen_repetitive_patterns.add(line_pattern)
                yield segmentation_line_features
                previous_token = token
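# --- Illustrative sketch (not the library's API) ---
# The repetitive-pattern logic above counts how often a line's text pattern
# occurs among the candidate lines and flags a line as repetitive when its
# pattern appears more than once (typical for running headers and footers).
# get_simple_text_pattern below is a simplified stand-in for the real
# get_text_pattern used above, and the sample lines are made up for the example.

import re
from collections import Counter


def get_simple_text_pattern(text: str) -> str:
    # collapse digits and surrounding whitespace so that e.g. page numbers or
    # years ("Journal of Examples 2021" vs "... 2022") map to the same pattern
    return re.sub(r'\d+', '', text).strip().lower()


candidate_lines = [
    'Journal of Examples 2021',
    '1 Introduction',
    'Journal of Examples 2022'
]
pattern_counter = Counter(
    get_simple_text_pattern(line) for line in candidate_lines
)
for line in candidate_lines:
    is_repetitive = pattern_counter[get_simple_text_pattern(line)] > 1
    print('%r -> is_repetitive=%s' % (line, is_repetitive))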