Пример #1
0
def get_text(html_content,
             display_images=False,
             deduplicate_captions=False,
             display_links=False,
             indentation='extended'):
    '''
    Converts an html string to its text representation.

    :param html_content: the html string to be converted to text
    :param display_images: whether to display image captions
    :param deduplicate_captions: whether to deduplicate captions such as
        image titles
    :param display_links: whether to display link targets
        (e.g. `[Python](https://www.python.org)`)
    :param indentation: either 'standard' (solely based on the css) or
        'extended' which indents divs and adds spaces between span tags
    :returns: the text representation of the html content, or an empty
        string if there is no content to convert
    '''
    html_content = html_content.strip()
    if not html_content:
        return ''

    # strip XML declaration, if necessary
    if html_content.startswith('<?xml '):
        html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)
        # the declaration may have been the only content; treat the
        # remainder like any other empty input rather than handing an empty
        # document to the html parser
        if not html_content.strip():
            return ''

    if indentation == 'extended':
        # work on a copy so that the shared default css is not modified
        css = DEFAULT_CSS.copy()
        css['div'] = HtmlElement('div', display=Display.block, padding=2)
        css['span'] = HtmlElement('span', prefix=' ', suffix=' ')
    else:
        css = DEFAULT_CSS

    html_tree = fromstring(html_content)
    parser = Inscriptis(html_tree,
                        display_images=display_images,
                        deduplicate_captions=deduplicate_captions,
                        display_links=display_links,
                        css=css)
    return parser.get_text()
Пример #2
0
    def __init__(self, html_tree, display_images=False, deduplicate_captions=False, display_links=False):
        '''
        Prepares the conversion state and converts ``html_tree`` right away.

        ::param: html_tree
            the root of the html tree that is passed to ``crawl_tree``
        ::param: display_images
            whether to include image titles/alt texts
        ::param: deduplicate_captions
            whether to deduplicate captions such as image titles
            (many newspapers include images and video previews with
             identical titles).
        ::param: display_links
            whether to display link targets (e.g. `[Python](https://www.python.org)`)
        '''
        # configuration
        self.cfg_deduplicate_captions = deduplicate_captions

        # handlers of optional features are registered as None if the
        # corresponding feature has been disabled
        link_start = self.start_a if display_links else None
        link_end = self.end_a if display_links else None
        image_start = self.start_img if display_images else None

        # call tables that map tag names to their start and end handlers
        self.start_tag_handler_dict = {
            'table': self.start_table,
            'tr': self.start_tr,
            'td': self.start_td,
            'th': self.start_td,
            'ul': self.start_ul,
            'ol': self.start_ol,
            'li': self.start_li,
            'br': self.newline,
            'a': link_start,
            'img': image_start,
        }
        self.end_tag_handler_dict = {
            'table': self.end_table,
            'ul': self.end_ul,
            'ol': self.end_ol,
            'td': self.end_td,
            'th': self.end_td,
            'a': link_end,
        }

        # stacks holding the open tags and the lines that are being composed
        self.current_tag = [HtmlElement()]
        self.current_line = [Line()]
        self.next_line = [Line()]

        # canvases receiving the finished lines: index 0 is the root canvas;
        # tables write into child canvases that are created per table cell
        # and merged into the root canvas once the table ends
        self.clean_text_lines = [[]]

        # list and table bookkeeping
        self.li_level = 0
        self.li_counter = []
        self.current_table = []

        # elements that are currently not displayed
        self.invisible = []
        # the most recently written image caption
        self.last_caption = None
        # only relevant if display_links is enabled
        self.link_target = ''

        # convert the tree and flush the line that is still pending
        self.crawl_tree(html_tree)
        pending_line = self.current_line[-1]
        if pending_line:
            self.write_line()
Пример #3
0
class Inscriptis(object):
    '''
    Converts an html tree to text.

    The tree is expected to consist of nodes that expose ``tag``, ``attrib``,
    ``text`` and ``tail`` and that yield their children when iterated. The
    conversion runs in the constructor; ``get_text`` returns the result.
    '''

    # bullets used for unordered lists; the nesting depth selects the bullet
    UL_COUNTER = ('* ', '+ ', 'o ', '- ')
    UL_COUNTER_LEN = len(UL_COUNTER)

    # element used for tags that have no entry in the css
    DEFAULT_ELEMENT = HtmlElement()

    def __init__(self,
                 html_tree,
                 display_images=False,
                 deduplicate_captions=False,
                 display_links=False,
                 css=None):
        '''
        Sets up the conversion state and immediately converts ``html_tree``.

        ::param: html_tree
            the root node of the html tree to convert
        ::param: display_images
            whether to include image titles/alt texts
        ::param: deduplicate_captions
            whether to deduplicate captions such as image titles
            (many newspapers include images and video previews with
             identical titles).
        ::param: display_links
            whether to display link targets (e.g. `[Python](https://www.python.org)`)
        ::param: css
            the tag name to element lookup used for formatting; DEFAULT_CSS
            is used if no (or an empty) css is provided
        '''
        # setup config
        self.cfg_deduplicate_captions = deduplicate_captions
        self.css = css if css else DEFAULT_CSS

        # setup start and end tag call tables; handlers of disabled features
        # are registered as None and skipped by the `if handler` checks
        self.start_tag_handler_dict = {
            'table': self.start_table,
            'tr': self.start_tr,
            'td': self.start_td,
            'th': self.start_td,
            'ul': self.start_ul,
            'ol': self.start_ol,
            'li': self.start_li,
            'br': self.newline,
            'a': self.start_a if display_links else None,
            'img': self.start_img if display_images else None,
        }
        self.end_tag_handler_dict = {
            'table': self.end_table,
            'ul': self.end_ul,
            'ol': self.end_ol,
            'td': self.end_td,
            'th': self.end_td,
            'a': self.end_a if display_links else None,
        }

        # instance variables
        # stack of the elements that belong to the currently open tags
        self.current_tag = [HtmlElement()]
        # stacks of the line being composed and of its successor; a new
        # level is pushed for every open table cell
        self.current_line = [Line()]
        self.next_line = [Line()]

        # the canvases used for displaying text
        # clean_text_line[0] refers to the root canvas; tables write into child
        # canvases that are created for every table cell and merged with the
        # root canvas at the end of a table
        self.clean_text_lines = [[]]

        # stack of the currently open tables
        self.current_table = []
        # per open list: the bullet string (ul) or the next number (ol)
        self.li_counter = []
        self.li_level = 0
        self.invisible = [
        ]  # stack of the elements opened within a hidden (display: none) subtree
        # the most recently written image caption
        self.last_caption = None

        # used if display_links is enabled
        self.link_target = ''

        # crawl the html tree
        self.crawl_tree(html_tree)
        # flush the line that is still pending after the last tag
        if self.current_line[-1]:
            self.write_line()

    def crawl_tree(self, tree):
        '''
        Recursively walks ``tree`` and feeds its content to the handlers.

        For nodes with a string tag the start tag, the node's text, all
        children and the end tag are processed in this order. The tail
        text is processed for every node, including the ones without a
        string tag.
        '''
        # NOTE(review): nodes without a string tag are presumably comments or
        # processing instructions - confirm against the tree implementation
        if isinstance(tree.tag, str):
            self.handle_starttag(tree.tag, tree.attrib)
            if tree.text:
                self.handle_data(tree.text)

            for node in tree:
                self.crawl_tree(node)

            self.handle_endtag(tree.tag)

        if tree.tail:
            self.handle_data(tree.tail)

    def get_text(self):
        '''
        ::returns:
           a text representation of the parsed content
        '''
        # join the lines of all remaining canvases, unescape the result and
        # remove trailing whitespace
        return unescape('\n'.join(chain(*self.clean_text_lines))).rstrip()

    def write_line(self, force=False):
        '''
        Writes the current line to the buffer, provided that there is any
        data to write.

        ::param: force
            write the line even if it is empty or only contains whitespace

        ::returns:
            True, if a line has been written, otherwise False
        '''
        # only break the line if there is any relevant content
        if not force and (not self.current_line[-1].content
                          or self.current_line[-1].content.isspace()):
            # nothing has been written: keep the larger of the line's and the
            # current tag's margin_before for the pending line
            self.current_line[-1].margin_before = max(
                self.current_line[-1].margin_before,
                self.current_tag[-1].margin_before)
            return False

        line = self.current_line[-1].get_text()
        self.clean_text_lines[-1].append(line)
        # the prepared next line becomes the current one
        self.current_line[-1] = self.next_line[-1]
        self.next_line[-1] = Line()
        return True

    def write_line_verbatim(self, text):
        '''
        Appends the given text to the current canvas without any
        modifications.
        '''
        self.clean_text_lines[-1].append(text)

    def handle_starttag(self, tag, attrs):
        '''
        Handles an opening tag.

        Determines the element used for the tag, tracks it on the tag
        stack, updates padding and margins of the affected lines and calls
        the start handler registered for the tag (if any).

        ::param: tag
            the name of the tag
        ::param: attrs
            the attributes of the tag
        '''
        # use the css to handle tags known to it :)

        cur = self.css.get(tag, Inscriptis.DEFAULT_ELEMENT)
        if 'style' in attrs:
            # derive the element from the css entry and the inline style
            cur = CssParse.get_style_attribute(attrs['style'],
                                               html_element=cur)
        self.current_tag.append(cur)
        # hidden elements and everything nested within them are only tracked
        # on the invisible stack (handle_endtag removes them again)
        if cur.display == Display.none or self.invisible:
            self.invisible.append(cur)
            return

        self.next_line[
            -1].padding = self.current_line[-1].padding + cur.padding
        # flush text before display:block elements
        if cur.display == Display.block:
            if not self.write_line():
                # nothing has been written: apply margin and padding to the
                # pending line
                self.current_line[-1].margin_before = max(
                    self.current_line[-1].margin_before, cur.margin_before)
                self.current_line[-1].padding = self.next_line[-1].padding
            else:
                # a line has been written: current_line now refers to the
                # former next line
                self.current_line[-1].margin_after = max(
                    self.current_line[-1].margin_after, cur.margin_after)

        handler = self.start_tag_handler_dict.get(tag, None)
        if handler:
            handler(attrs)

    def handle_endtag(self, tag):
        '''
        Handles a closing tag.

        Removes the element from the tag stack, updates padding and margins
        of the affected lines and calls the end handler registered for the
        tag (if any).

        ::param: tag
            the name of the tag
        '''
        cur = self.current_tag.pop()
        # unwind the invisible stack without producing any output
        if self.invisible:
            self.invisible.pop()
            return

        # the line following the element no longer includes its padding
        self.next_line[
            -1].padding = self.current_line[-1].padding - cur.padding
        self.current_line[-1].margin_after = max(
            self.current_line[-1].margin_after, cur.margin_after)
        # flush text after display:block elements
        if cur.display == Display.block:
            # propagate the new padding to the current line, if nothing has
            # been written
            if not self.write_line():
                self.current_line[-1].padding = self.next_line[-1].padding

        handler = self.end_tag_handler_dict.get(tag, None)
        if handler:
            handler()

    def handle_data(self, data):
        '''
        Appends text to the current line, unless it is located within a
        hidden element.

        ::param: data
            the text to append
        '''
        if self.invisible:
            return

        # protect pre areas by enclosing them in NUL characters
        # NOTE(review): presumably evaluated by Line.get_text - confirm
        if self.current_tag[-1].whitespace == WhiteSpace.pre:
            data = '\0' + data + '\0'

        # add the prefix and suffix of the current tag
        data = self.current_tag[-1].prefix + data + self.current_tag[-1].suffix

        # current_line[-1] is either the line of the innermost open table
        # cell or the standard line
        self.current_line[-1].content += data

    def start_ul(self, attrs):
        '''
        Opens an unordered list and registers the bullet of its nesting
        level.
        '''
        self.li_level += 1
        self.li_counter.append(Inscriptis.get_bullet(self.li_level - 1))

    def end_ul(self):
        '''
        Closes the innermost list.
        '''
        self.li_level -= 1
        self.li_counter.pop()

    def start_img(self, attrs):
        '''
        Writes the caption of an image (its alt text or, if there is none,
        its title) in square brackets.

        The caption is skipped if caption deduplication is enabled and the
        caption equals the previously written one.
        '''
        image_text = attrs.get('alt', '') or attrs.get('title', '')
        if image_text and not (self.cfg_deduplicate_captions
                               and image_text == self.last_caption):
            self.current_line[-1].content += '[{}]'.format(image_text)
            self.last_caption = image_text

    def start_a(self, attrs):
        '''
        Remembers the link target (href) and opens the link text with '['.
        '''
        self.link_target = attrs.get('href', '')
        self.current_line[-1].content += '['

    def end_a(self):
        '''
        Closes the link text and appends the remembered link target.
        '''
        self.current_line[-1].content += ']({})'.format(self.link_target)

    def start_ol(self, attrs):
        '''
        Opens an ordered list; its items are numbered starting with 1.
        '''
        self.li_counter.append(1)
        self.li_level += 1

    def end_ol(self):
        '''
        Closes the innermost list.
        '''
        self.li_level -= 1
        self.li_counter.pop()

    def start_li(self, attrs):
        '''
        Starts a list item on a new line and sets the bullet of that line.
        '''
        self.write_line()
        if self.li_level > 0:
            bullet = self.li_counter[-1]
        else:
            # list item outside of any list
            bullet = "* "
        if isinstance(bullet, int):
            # ordered list: use the number and advance the counter
            self.li_counter[-1] += 1
            self.current_line[-1].list_bullet = "{}. ".format(bullet)
        else:
            self.current_line[-1].list_bullet = bullet

    def start_table(self, attrs):
        '''
        Opens a new (possibly nested) table.
        '''
        self.current_table.append(Table())

    def start_tr(self, attrs):
        '''
        Adds a row to the innermost table; ignored outside of tables.
        '''
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            self.current_table[-1].add_row()

    def start_td(self, attrs):
        '''
        Opens a table cell that collects its content on a canvas of its
        own; ignored outside of tables.
        '''
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            # open td tag: push a canvas and a line pair for the cell and
            # register the canvas with the table
            self.clean_text_lines.append([])
            self.current_line.append(Line())
            self.next_line.append(Line())
            self.current_table[-1].add_cell(self.clean_text_lines[-1])
            self.current_table[-1].td_is_open = True

    def end_td(self):
        '''
        Closes the open table cell (if any): writes its pending line and
        removes the canvas and line pair of the cell from the stacks.
        '''
        if self.current_table and self.current_table[-1].td_is_open:
            self.current_table[-1].td_is_open = False
            # the pending line is written even if it is empty
            self.write_line(force=True)
            self.clean_text_lines.pop()
            self.current_line.pop()
            self.next_line.pop()

    def end_tr(self):
        '''
        Does nothing; this method is not registered in the end tag call
        table.
        '''
        pass

    def end_table(self):
        '''
        Closes the innermost table and writes its text representation to
        the enclosing canvas.
        '''
        # cleanup a <td> tag that has not been closed yet
        if self.current_table and self.current_table[-1].td_is_open:
            self.end_td()
        self.write_line()
        table = self.current_table.pop()
        self.write_line_verbatim(table.get_text())

    def newline(self, attrs):
        '''
        Breaks the line, even if the current line is empty.
        '''
        self.write_line(force=True)

    @staticmethod
    def get_bullet(index):
        '''
        ::param: index
            the zero-based nesting level of the list; the bullets repeat
            once all bullets of UL_COUNTER have been used
        ::return:
           the bullet that corresponds to the given index
        '''
        return Inscriptis.UL_COUNTER[index % Inscriptis.UL_COUNTER_LEN]
Пример #4
0
def test_style_unit_parsing():
    '''
    Margins given in fractional em units within a style attribute are
    expected to yield a margin of 3 before and after the element.
    '''
    style = "margin-top:2.666666667em;margin-bottom: 2.666666667em"
    base_element = HtmlElement()

    parsed = CssParse.get_style_attribute(style, html_element=base_element)

    assert parsed.margin_before == 3
    assert parsed.margin_after == 3