Пример #1
0
    def classify(self, prev_block: TextBlock, curr_block: TextBlock,
                 next_block: TextBlock):
        """Classify ``curr_block`` as content or not and store the verdict on it.

        The verdict comes from a hard-coded threshold tree over the link
        density of the previous and current blocks and the text density of
        all three blocks.

        :param prev_block: block preceding ``curr_block``; its ``link_density``
            and ``text_density`` are read.
        :param curr_block: block being classified; its ``is_content``
            attribute is overwritten with the verdict.
        :param next_block: block following ``curr_block``; its
            ``text_density`` is read.
        :return: ``True`` if ``curr_block.is_content`` changed value,
            ``False`` if it already held the computed verdict.
        """
        if curr_block.link_density <= 0.333333:
            if prev_block.link_density <= 0.555556:
                if curr_block.text_density <= 9:
                    if next_block.text_density <= 10:
                        # All three blocks are sparse: only keep the current
                        # one if the previous block clears its own threshold.
                        if prev_block.text_density <= 4:
                            is_content = False
                        else:
                            is_content = True
                    else:
                        is_content = True
                else:
                    # Dense current block: rejected only when the next block
                    # has a text density of exactly zero.
                    if next_block.text_density == 0:
                        is_content = False
                    else:
                        is_content = True
            else:
                # Previous block is link-heavy: the next block decides.
                if next_block.text_density <= 11:
                    is_content = False
                else:
                    is_content = True
        else:
            # Current block is itself link-heavy: never content.
            is_content = False

        # Report a change only when the stored flag actually differs from the
        # new verdict (the previous identity test reported the opposite).
        changes = curr_block.is_content != is_content
        curr_block.is_content = is_content

        return changes
Пример #2
0
    def classify(self, prev_block: TextBlock, curr_block: TextBlock,
                 next_block: TextBlock):
        """Classify ``curr_block`` as content or not and store the verdict on it.

        The verdict comes from a hard-coded threshold tree over the link
        density of the previous and current blocks and the word counts
        (``num_words``) of all three blocks.

        :param prev_block: block preceding ``curr_block``; its ``link_density``
            and ``num_words`` are read.
        :param curr_block: block being classified; its ``is_content``
            attribute is overwritten with the verdict.
        :param next_block: block following ``curr_block``; its ``num_words``
            is read.
        :return: ``True`` if ``curr_block.is_content`` changed value,
            ``False`` if it already held the computed verdict.
        """
        if curr_block.link_density <= 0.333333:
            if prev_block.link_density <= 0.555556:
                if curr_block.num_words <= 16:
                    if next_block.num_words <= 15:
                        # Short current and next blocks: keep the current one
                        # only if the previous block has more than 4 words.
                        if prev_block.num_words <= 4:
                            is_content = False
                        else:
                            is_content = True
                    else:
                        is_content = True
                else:
                    is_content = True
            else:
                # Previous block is link-heavy: a longer current block is kept
                # outright, a shorter one needs a long enough next block.
                if curr_block.num_words <= 40:
                    if next_block.num_words <= 17:
                        is_content = False
                    else:
                        is_content = True
                else:
                    is_content = True
        else:
            # Current block is itself link-heavy: never content.
            is_content = False

        # Report a change only when the stored flag actually differs from the
        # new verdict (the previous identity test reported the opposite).
        changes = curr_block.is_content != is_content
        curr_block.is_content = is_content

        return changes
Пример #3
0
    def classify(self, prev_block: TextBlock, curr_block: TextBlock,
                 next_block: TextBlock):
        """Classify ``curr_block`` as content or not and store the verdict on it.

        The block is content when any one of three hard-coded rules over the
        ``num_words`` and ``link_density`` of the three blocks holds.

        :param prev_block: block preceding ``curr_block``.
        :param curr_block: block being classified; its ``is_content``
            attribute is overwritten with the verdict.
        :param next_block: block following ``curr_block``.
        :return: ``True`` if ``curr_block.is_content`` changed value,
            ``False`` if it already held the computed verdict.
        """
        # Rule 1: the current block contains links and the next block is long.
        cond1 = curr_block.link_density > 0 and next_block.num_words > 11
        # Rule 2: the current block is long on its own.
        cond2 = curr_block.num_words > 19
        # Rule 3: both neighbours are link-free, the next block is not tiny,
        # and at least one of the three blocks clears its length threshold.
        cond3 = (
            next_block.num_words > 6
            and next_block.link_density == 0
            and prev_block.link_density == 0
            and (curr_block.num_words > 6
                 or prev_block.num_words > 7
                 or next_block.num_words > 19)
        )
        is_content = cond1 or cond2 or cond3

        # Report a change only when the stored flag actually differs from the
        # new verdict (the previous identity test reported the opposite).
        changes = curr_block.is_content != is_content
        curr_block.is_content = is_content

        return changes
Пример #4
0
    def process(self, doc: TextDocument) -> bool:
        """Split every multi-paragraph text block of ``doc`` into one block per paragraph.

        A block's text is split with ``self.NEWLINE_REGEX``. Blocks that yield
        fewer than two pieces are kept as they are; otherwise each piece
        becomes a new ``TextBlock`` that inherits the original block's
        ``is_content`` flag and labels.

        :param doc: document whose ``text_blocks`` are inspected and, if any
            block was split, replaced by the rebuilt list.
        :return: ``True`` if at least one block was split, ``False`` otherwise
            (in which case ``doc.text_blocks`` is left untouched).
        """
        rebuilt = []
        was_split = False
        for block in doc.text_blocks:
            pieces = self.NEWLINE_REGEX.split(block.text)
            if len(pieces) >= 2:
                was_split = True
                content_flag = block.is_content
                block_labels = block.labels
                for piece in pieces:
                    # Only the text, content flag and labels carry over to
                    # the replacement block.
                    replacement = TextBlock(piece)
                    replacement.is_content = content_flag
                    replacement.add_labels(block_labels)
                    rebuilt.append(replacement)
            else:
                # Single paragraph: reuse the existing block object.
                rebuilt.append(block)

        if was_split:
            doc.text_blocks = rebuilt
        return was_split
Пример #5
0
def make_doc(words_arr,
             num_anchor_words_arr=None,
             is_content_arr=None,
             label_arr=None):
    """Build a ``TextDocument`` with one ``TextBlock`` per entry of ``words_arr``.

    :param words_arr: per-block specification. An ``int`` entry is used as the
        word count and the text is that many items of ``default_words`` joined
        by spaces; any other entry is used as the text itself, with the word
        count taken as the number of space characters in it.
    :param num_anchor_words_arr: optional per-block anchor word counts; a
        missing array or missing index falls back to ``0``.
    :param is_content_arr: optional per-block ``is_content`` values; a missing
        array or missing index leaves the block's flag as constructed.
    :param label_arr: optional per-block labels; each entry may be ``None``
        (no label), a list of labels, or a single label.
    :return: a ``TextDocument`` wrapping the created blocks in input order.
    """
    blocks = []
    for position, spec in enumerate(words_arr):
        if not isinstance(spec, int):
            block_text = spec
            word_count = block_text.count(' ')
        else:
            word_count = spec
            block_text = ' '.join(default_words[:word_count])

        # A ``None`` array raises TypeError and a short one IndexError; both
        # mean "no value supplied for this block".
        try:
            anchor_count = num_anchor_words_arr[position]
        except (TypeError, IndexError):
            anchor_count = 0

        block = TextBlock(block_text, set(), word_count, anchor_count, 0, 0, position)

        try:
            block.is_content = is_content_arr[position]
        except (TypeError, IndexError):
            pass

        try:
            entry = label_arr[position]
            if entry is not None:
                # Normalise a single label to a one-element list.
                for item in (entry if isinstance(entry, list) else [entry]):
                    block.add_label(item)
        except (TypeError, IndexError):
            pass

        blocks.append(block)

    return TextDocument(blocks)