Пример #1
0
    def process(self, doc: TextDocument) -> bool:
        """
        Repeatedly merge blocks into the content block that precedes them.

        A block is merged into the block before it when that previous block is flagged as content, the
        block's link density is below 0.56 and the block does not carry the STRICTLY_NOT_CONTENT label.
        The document is scanned again and again until a scan merges nothing.

        :param doc: The document to process; its ``text_blocks`` are replaced when anything was merged
        :return: True if at least one block was merged, otherwise False
        """
        text_blocks = doc.text_blocks
        if len(text_blocks) < 2:
            return False
        changes = False
        # True while the previous pass merged something, so another pass is worthwhile
        changed_on_pass = True
        while changed_on_pass:
            changed_on_pass = False
            # the merge target restarts from the first remaining block on every pass
            prev_block = text_blocks[0]
            blocks_to_remove = []
            # iterate over a slice copy so rebinding text_blocks below cannot disturb the scan
            for block in text_blocks[1:]:
                if (prev_block.is_content and block.link_density < 0.56
                        and not block.has_label(DefaultLabels.STRICTLY_NOT_CONTENT)):
                    prev_block.merge_next(block)
                    blocks_to_remove.append(block)
                    changed_on_pass = True
                    changes = True
                else:
                    prev_block = block
            # Remove the merged blocks once per pass instead of once per scanned block: the repeated
            # call redid the same subtraction with an ever-growing removal list.
            # NOTE(review): subtract_blocks is not visible here; presumably it returns the list without
            # the given blocks - confirm.
            if blocks_to_remove:
                text_blocks = self.subtract_blocks(text_blocks,
                                                   blocks_to_remove)
        if changes:
            doc.text_blocks = text_blocks

        return changes
Пример #2
0
    def process(self, doc: TextDocument) -> bool:
        """
        Keep only the blocks flagged as content.

        The document's block list is always replaced by the filtered list, even when nothing was dropped.

        :param doc: The document whose ``text_blocks`` are filtered
        :return: True if at least one block was dropped, otherwise False
        """
        original_blocks = doc.text_blocks
        kept_blocks = []
        for candidate in original_blocks:
            if candidate.is_content:
                kept_blocks.append(candidate)

        # fewer blocks than before means something was filtered out
        dropped_any = len(kept_blocks) < len(original_blocks)
        doc.text_blocks = kept_blocks
        return dropped_any
Пример #3
0
    def to_text_document(self) -> TextDocument:
        """
        Build a TextDocument from the TextBlocks collected so far and the title.

        NOTE: Only call this after parsing.

        :return: The TextDocument
        """

        # Flush first, just to be sure.
        # NOTE(review): presumably this commits a block that is still being accumulated - confirm.
        self.flush_block()
        document = TextDocument(self.text_blocks, self.title)
        return document
Пример #4
0
    def process(self, doc: TextDocument) -> bool:
        """
        Merge content blocks that lie close enough to the block before them.

        A content block is merged into the current anchor block when the number of blocks between them
        (``offset_blocks_start`` of the block minus ``offset_blocks_end`` of the anchor, minus one) does
        not exceed ``self.max_blocks_distance``. With ``self.content_only`` the anchor must be content
        too, and with ``self.same_tag_level_only`` both must share the same ``tag_level``. Any block that
        is not merged becomes the new anchor.

        :param doc: The document to process; its ``text_blocks`` are replaced when anything was merged
        :return: True if at least one block was merged, otherwise False
        """
        blocks = doc.text_blocks
        if len(blocks) < 2:
            return False

        if self.content_only:
            # start at the first content block; without one there is nothing to fuse
            first = next((position for position, candidate in enumerate(blocks)
                          if candidate.is_content), None)
            if first is None:
                return False
        else:
            first = 0

        merged = []
        anchor = blocks[first]
        for current in blocks[first + 1:]:
            if current.is_content:
                gap = (current.offset_blocks_start -
                       anchor.offset_blocks_end - 1)
                if gap <= self.max_blocks_distance:
                    fusable = True
                    if self.content_only and not (anchor.is_content
                                                  and current.is_content):
                        fusable = False
                    if (self.same_tag_level_only
                            and anchor.tag_level != current.tag_level):
                        fusable = False
                    if fusable:
                        anchor.merge_next(current)
                        # the merged block is dropped from the document afterwards
                        merged.append(current)
                        continue
            # not merged (non-content, too far away, or vetoed): it becomes the next merge target
            anchor = current

        if not merged:
            return False
        doc.text_blocks = self.subtract_blocks(blocks, merged)
        return True
Пример #5
0
    def process(self, doc: TextDocument) -> bool:
        """
        Merge each block into the preceding one when their labels match.

        Labels are compared with ``self.equal_labels``. A block that does not match becomes the new
        merge target for the blocks after it.

        :param doc: The document to process; its ``text_blocks`` are replaced when anything was merged
        :return: True if at least one block was merged, otherwise False
        """
        blocks = doc.text_blocks
        if len(blocks) < 2:
            return False

        absorbed = []
        anchor = blocks[0]
        for current in blocks[1:]:
            if not self.equal_labels(anchor.labels, current.labels):
                # labels differ: later blocks are compared against this one instead
                anchor = current
                continue
            anchor.merge_next(current)
            absorbed.append(current)

        if not absorbed:
            return False
        doc.text_blocks = self.subtract_blocks(blocks, absorbed)
        return True
Пример #6
0
 def parse_doc(self, input_str: str) -> Union[TextDocument, None]:
     """
     Parse HTML into a TextDocument, retrying once if the first attempt fails.

     The retry uses a fresh parser and feeds it the input with every ``SCRIPT_REGEX`` match replaced
     by an empty ``<script></script>`` element.

     :param input_str: The HTML source to parse
     :return: The parsed document, or an empty TextDocument when both attempts fail and
         ``raise_on_failure`` is false
     :raises HTMLExtractionError: If both attempts fail and ``raise_on_failure`` is true
     """
     bp_parser = parser.BoilerpipeHTMLParser(
         raise_on_failure=self.raise_on_failure)
     try:
         bp_parser.feed(input_str)
     except Exception:
         # Catch Exception rather than everything, so KeyboardInterrupt and SystemExit propagate
         # instead of silently triggering a second parse.
         # In case of error, try again on a fresh parser, first removing script tag content.
         bp_parser = parser.BoilerpipeHTMLParser(
             raise_on_failure=self.raise_on_failure)
         stripped_input = self.SCRIPT_REGEX.sub('<script></script>', input_str)
         try:
             bp_parser.feed(stripped_input)
         except Exception as ex:
             logger.exception('Error parsing HTML')
             if self.raise_on_failure:
                 raise HTMLExtractionError from ex
             # best-effort mode: hand back a document with no blocks
             return TextDocument([])
     return bp_parser.to_text_document()
Пример #7
0
    def process(self, doc: TextDocument) -> bool:
        """
        Split every block whose text contains a ``NEWLINE_REGEX`` match into one block per piece.

        Each new block is created from the piece of text alone and then receives the ``is_content``
        flag and the labels of the block it came from. Blocks that do not split are kept as they are.

        :param doc: The document to process; its ``text_blocks`` are replaced when anything was split
        :return: True if at least one block was split, otherwise False
        """
        rebuilt = []
        split_any = False
        for source in doc.text_blocks:
            pieces = self.NEWLINE_REGEX.split(source.text)
            if len(pieces) < 2:
                # nothing to split: keep the original block object
                rebuilt.append(source)
                continue

            split_any = True
            content_flag = source.is_content
            source_labels = source.labels
            for piece in pieces:
                fragment = TextBlock(piece)
                fragment.is_content = content_flag
                fragment.add_labels(source_labels)
                rebuilt.append(fragment)

        if split_any:
            doc.text_blocks = rebuilt
        return split_any
Пример #8
0
def make_doc(words_arr,
             num_anchor_words_arr=None,
             is_content_arr=None,
             label_arr=None):
    """
    Build a TextDocument with one TextBlock per entry of ``words_arr``.

    The optional arrays are read at the same index as the block. A missing array (None) or one that
    is too short is tolerated: the anchor word count then falls back to 0, and the content flag and
    labels are simply not set.

    :param words_arr: Per block, either an int (that many words are taken from ``default_words``) or
        the block text itself
    :param num_anchor_words_arr: Optional per-block anchor word counts
    :param is_content_arr: Optional per-block values for ``is_content``
    :param label_arr: Optional per-block label, list of labels, or None for no label
    :return: The TextDocument holding the generated blocks
    """
    built_blocks = []
    for position, spec in enumerate(words_arr):
        if not isinstance(spec, int):
            content = spec
            # NOTE(review): this counts spaces, which is one less than the number of
            # space-separated words - confirm that is intended.
            word_total = content.count(' ')
        else:
            word_total = spec
            content = ' '.join(default_words[:word_total])

        anchor_total = 0
        try:
            anchor_total = num_anchor_words_arr[position]
        except (TypeError, IndexError):
            # no array given, or no entry for this block: keep the default of 0
            pass

        new_block = TextBlock(content, set(), word_total, anchor_total, 0, 0, position)

        try:
            new_block.is_content = is_content_arr[position]
        except (TypeError, IndexError):
            pass

        try:
            entry = label_arr[position]
            if isinstance(entry, list):
                for single_label in entry:
                    new_block.add_label(single_label)
            elif entry is not None:
                new_block.add_label(entry)
        except (TypeError, IndexError):
            pass

        built_blocks.append(new_block)

    return TextDocument(built_blocks)