Пример #1
0
    def process(self, doc: TextDocument) -> bool:
        """
        Fuse blocks into the preceding content block when they look like content.

        A block is merged into its predecessor when the predecessor is marked
        as content, the block's link density is below 0.56 and the block does
        not carry the ``DefaultLabels.STRICTLY_NOT_CONTENT`` label. Passes over
        the block list are repeated until a pass performs no merge.

        :param doc: document whose ``text_blocks`` are processed; the attribute
            is reassigned only when at least one merge happened.
        :return: ``True`` if at least one block was merged, ``False`` otherwise
            (including when the document has fewer than two blocks).
        """
        text_blocks = doc.text_blocks
        if len(text_blocks) < 2:
            return False
        changes = False
        # True while the previous pass merged something, i.e. another pass is
        # still needed.
        changed_on_pass = True
        while changed_on_pass:
            changed_on_pass = False
            # Every pass restarts from the head of the (possibly shortened)
            # list with a fresh removal list.
            prev_block = text_blocks[0]
            blocks_to_remove = []
            for block in text_blocks[1:]:
                if (prev_block.is_content and block.link_density < 0.56
                        and not block.has_label(
                            DefaultLabels.STRICTLY_NOT_CONTENT)):
                    # prev_block stays the merge target so that a run of
                    # consecutive candidates is folded into the same block.
                    prev_block.merge_next(block)
                    blocks_to_remove.append(block)
                    changed_on_pass = True
                    changes = True
                else:
                    prev_block = block
            # Remove the merged blocks once per pass. Doing this inside the
            # loop re-subtracted a growing removal list from an already
            # shortened block list on every iteration, which is quadratic and
            # makes the result depend on how the helper copes with entries
            # that are no longer present.
            text_blocks = self.subtract_blocks(text_blocks,
                                               blocks_to_remove)
        if changes:
            doc.text_blocks = text_blocks

        return changes
Пример #2
0
    def process(self, doc: TextDocument) -> bool:
        """
        Drop every block that is not marked as content.

        ``doc.text_blocks`` is always replaced by a new list holding only the
        blocks whose ``is_content`` is truthy, in their original order.

        :param doc: document whose ``text_blocks`` are filtered.
        :return: ``True`` if at least one block was dropped, else ``False``.
        """
        original_blocks = doc.text_blocks

        content_blocks = []
        for candidate in original_blocks:
            if candidate.is_content:
                content_blocks.append(candidate)

        # Compare sizes before publishing the filtered list on the document.
        removed_any = len(content_blocks) < len(original_blocks)
        doc.text_blocks = content_blocks
        return removed_any
Пример #3
0
    def process(self, doc: TextDocument) -> bool:
        """
        Merge content blocks into a nearby preceding block.

        Walking the blocks in order, a content block is merged into the
        current anchor block when the number of blocks between them
        (``offset_blocks_start`` of the block minus ``offset_blocks_end`` of
        the anchor, minus one) does not exceed ``self.max_blocks_distance``,
        and the optional restrictions hold:

        * ``self.content_only``: the anchor must itself be content, and the
          walk starts at the first content block instead of the first block;
        * ``self.same_tag_level_only``: both blocks must share ``tag_level``.

        Any block that is not merged becomes the new anchor.

        :param doc: document whose ``text_blocks`` are processed; the attribute
            is reassigned only when at least one merge happened.
        :return: ``True`` if at least one block was merged, else ``False``.
        """
        text_blocks = doc.text_blocks
        if len(text_blocks) < 2:
            return False

        start_idx = 0
        if self.content_only:
            # Begin at the first content block; nothing to do without one.
            start_idx = next(
                (pos for pos, candidate in enumerate(text_blocks)
                 if candidate.is_content),
                None,
            )
            if start_idx is None:
                return False

        anchor = text_blocks[start_idx]
        merged_away = []
        for current in text_blocks[start_idx + 1:]:
            if not current.is_content:
                # Non-content blocks are never merged, but they do become the
                # anchor for whatever follows.
                anchor = current
                continue

            gap = current.offset_blocks_start - anchor.offset_blocks_end - 1
            close_enough = gap <= self.max_blocks_distance
            if (close_enough
                    and not (self.content_only
                             and not (anchor.is_content
                                      and current.is_content))
                    and not (self.same_tag_level_only
                             and anchor.tag_level != current.tag_level)):
                anchor.merge_next(current)
                # The merged block is dropped from the document afterwards.
                merged_away.append(current)
            else:
                anchor = current

        if not merged_away:
            return False

        doc.text_blocks = self.subtract_blocks(text_blocks, merged_away)
        return True
Пример #4
0
    def process(self, doc: TextDocument) -> bool:
        """
        Merge runs of consecutive blocks whose labels compare equal.

        The first block of a run absorbs every following block for which
        ``self.equal_labels`` reports equal labels; the first block that does
        not match starts a new run.

        :param doc: document whose ``text_blocks`` are processed; the attribute
            is reassigned only when at least one merge happened.
        :return: ``True`` if at least one block was merged, else ``False``.
        """
        all_blocks = doc.text_blocks
        if len(all_blocks) < 2:
            return False

        run_head, *followers = all_blocks
        absorbed = []
        for candidate in followers:
            if not self.equal_labels(run_head.labels, candidate.labels):
                # Labels differ: this block opens the next run.
                run_head = candidate
                continue
            run_head.merge_next(candidate)
            absorbed.append(candidate)

        if not absorbed:
            return False

        doc.text_blocks = self.subtract_blocks(all_blocks, absorbed)
        return True
Пример #5
0
    def process(self, doc: TextDocument) -> bool:
        """
        Split blocks into one block per piece produced by ``NEWLINE_REGEX``.

        A block whose text yields fewer than two pieces is kept as is. Any
        other block is replaced by one new ``TextBlock`` per piece; only the
        source block's ``is_content`` flag and labels are copied onto the new
        blocks explicitly.

        :param doc: document whose ``text_blocks`` are processed; the attribute
            is reassigned only when at least one block was split.
        :return: ``True`` if at least one block was split, else ``False``.
        """
        rebuilt = []
        split_any = False

        for source_block in doc.text_blocks:
            pieces = self.NEWLINE_REGEX.split(source_block.text)
            if len(pieces) >= 2:
                split_any = True
                # Read the inherited properties once per source block.
                inherited_flag = source_block.is_content
                inherited_labels = source_block.labels
                for piece in pieces:
                    fragment = TextBlock(piece)
                    fragment.is_content = inherited_flag
                    fragment.add_labels(inherited_labels)
                    rebuilt.append(fragment)
            else:
                rebuilt.append(source_block)

        if split_any:
            doc.text_blocks = rebuilt
        return split_any