Python HtmlElement примеры использования

Язык программирования: Python

Пространство имен/Пакет: inscriptis.css

Класс/Тип: HtmlElement

Примеров на hotexamples.com: 4

Python HtmlElement - 4 примера найдено. Это лучшие примеры Python кода для inscriptis.css.HtmlElement, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

HtmlElement(4)

Основные методы

HtmlElement (4)

Пример #1

Показать файл

Файл: __init__.py Проект: MindaugasVaitkus2/inscriptis

def get_text(html_content,
             display_images=False,
             deduplicate_captions=False,
             display_links=False,
             indentation='extended'):
    '''
    :param html_content: the html string to be converted to text
    :param display_images: whether to display image caption
    :param indentation: either 'standard' (solely based on the css) or 'extended'
        which intends divs and adds spaces between span tags
    '''
    html_content = html_content.strip()
    if not html_content:
        return ''

    if indentation == 'extended':
        css = DEFAULT_CSS.copy()
        css['div'] = HtmlElement('div', display=Display.block, padding=2)
        css['span'] = HtmlElement('span', prefix=' ', suffix=' ')
    else:
        css = DEFAULT_CSS

    # strip XML declaration, if necessary
    if html_content.startswith('<?xml '):
        html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)

    html_tree = fromstring(html_content)
    parser = Inscriptis(html_tree,
                        display_images=display_images,
                        deduplicate_captions=deduplicate_captions,
                        display_links=display_links,
                        css=css)
    return parser.get_text()

Пример #2

Показать файл

    def __init__(self, html_tree, display_images=False, deduplicate_captions=False, display_links=False):
        '''
        ::param: display_images \
            whether to include image tiles/alt texts
        ::param: deduplicate_captions \
            whether to deduplicate captions such as image titles
            (many newspaper include images and video previews with
             identifical titles).
        ::param: display_links \
            whether to display link targets (e.g. `[Python](https://www.python.org)`)
        '''
        # setup config
        self.cfg_deduplicate_captions = deduplicate_captions

        # setup start and end tag call tables
        self.start_tag_handler_dict = {
            'table': self.start_table,
            'tr': self.start_tr,
            'td': self.start_td,
            'th': self.start_td,
            'ul': self.start_ul,
            'ol': self.start_ol,
            'li': self.start_li,
            'br': self.newline,
            'a': self.start_a if display_links else None,
            'img': self.start_img if display_images else None,
        }
        self.end_tag_handler_dict = {
            'table': self.end_table,
            'ul': self.end_ul,
            'ol': self.end_ol,
            'td': self.end_td,
            'th': self.end_td,
            'a': self.end_a if display_links else None,
        }

        # instance variables
        self.current_tag = [HtmlElement()]
        self.current_line = [Line()]
        self.next_line = [Line()]

        # the canvases used for displaying text
        # clean_text_line[0] refers to the root canvas; tables write into child
        # canvases that are created for every table line and merged with the
        # root canvas at the end of a table
        self.clean_text_lines = [[]]

        self.current_table = []
        self.li_counter = []
        self.li_level = 0
        self.invisible = []  # a list of attributes that are considered invisible
        self.last_caption = None

        # used if display_links is enabled
        self.link_target = ''

        # crawl the html tree
        self.crawl_tree(html_tree)
        if self.current_line[-1]:
            self.write_line()

Пример #3

Показать файл

Файл: html_engine.py Проект: MindaugasVaitkus2/inscriptis

class Inscriptis(object):

    UL_COUNTER = ('* ', '+ ', 'o ', '- ')
    UL_COUNTER_LEN = len(UL_COUNTER)

    DEFAULT_ELEMENT = HtmlElement()

    def __init__(self,
                 html_tree,
                 display_images=False,
                 deduplicate_captions=False,
                 display_links=False,
                 css=None):
        '''
        ::param: display_images \
            whether to include image tiles/alt texts
        ::param: deduplicate_captions \
            whether to deduplicate captions such as image titles
            (many newspaper include images and video previews with
             identifical titles).
        ::param: display_links \
            whether to display link targets (e.g. `[Python](https://www.python.org)`)
        '''
        # setup config
        self.cfg_deduplicate_captions = deduplicate_captions
        self.css = css if css else DEFAULT_CSS

        # setup start and end tag call tables
        self.start_tag_handler_dict = {
            'table': self.start_table,
            'tr': self.start_tr,
            'td': self.start_td,
            'th': self.start_td,
            'ul': self.start_ul,
            'ol': self.start_ol,
            'li': self.start_li,
            'br': self.newline,
            'a': self.start_a if display_links else None,
            'img': self.start_img if display_images else None,
        }
        self.end_tag_handler_dict = {
            'table': self.end_table,
            'ul': self.end_ul,
            'ol': self.end_ol,
            'td': self.end_td,
            'th': self.end_td,
            'a': self.end_a if display_links else None,
        }

        # instance variables
        self.current_tag = [HtmlElement()]
        self.current_line = [Line()]
        self.next_line = [Line()]

        # the canvases used for displaying text
        # clean_text_line[0] refers to the root canvas; tables write into child
        # canvases that are created for every table line and merged with the
        # root canvas at the end of a table
        self.clean_text_lines = [[]]

        self.current_table = []
        self.li_counter = []
        self.li_level = 0
        self.invisible = [
        ]  # a list of attributes that are considered invisible
        self.last_caption = None

        # used if display_links is enabled
        self.link_target = ''

        # crawl the html tree
        self.crawl_tree(html_tree)
        if self.current_line[-1]:
            self.write_line()

    def crawl_tree(self, tree):
        if isinstance(tree.tag, str):
            self.handle_starttag(tree.tag, tree.attrib)
            if tree.text:
                self.handle_data(tree.text)

            for node in tree:
                self.crawl_tree(node)

            self.handle_endtag(tree.tag)

        if tree.tail:
            self.handle_data(tree.tail)

    def get_text(self):
        '''
        ::returns:
           a text representation of the parsed content
        '''
        return unescape('\n'.join(chain(*self.clean_text_lines))).rstrip()

    def write_line(self, force=False):
        '''
        Writes the current line to the buffer, provided that there is any
        data to write.

        ::returns:
            True, if a line has been writer, otherwise False
        '''
        # only break the line if there is any relevant content
        if not force and (not self.current_line[-1].content
                          or self.current_line[-1].content.isspace()):
            self.current_line[-1].margin_before = max(
                self.current_line[-1].margin_before,
                self.current_tag[-1].margin_before)
            return False

        line = self.current_line[-1].get_text()
        self.clean_text_lines[-1].append(line)
        self.current_line[-1] = self.next_line[-1]
        self.next_line[-1] = Line()
        return True

    def write_line_verbatim(self, text):
        '''
        Writes the current buffer without any modifications.
        '''
        self.clean_text_lines[-1].append(text)

    def handle_starttag(self, tag, attrs):
        # use the css to handle tags known to it :)

        cur = self.css.get(tag, Inscriptis.DEFAULT_ELEMENT)
        if 'style' in attrs:
            cur = CssParse.get_style_attribute(attrs['style'],
                                               html_element=cur)
        self.current_tag.append(cur)
        if cur.display == Display.none or self.invisible:
            self.invisible.append(cur)
            return

        self.next_line[
            -1].padding = self.current_line[-1].padding + cur.padding
        # flush text before display:block elements
        if cur.display == Display.block:
            if not self.write_line():
                self.current_line[-1].margin_before = max(
                    self.current_line[-1].margin_before, cur.margin_before)
                self.current_line[-1].padding = self.next_line[-1].padding
            else:
                self.current_line[-1].margin_after = max(
                    self.current_line[-1].margin_after, cur.margin_after)

        handler = self.start_tag_handler_dict.get(tag, None)
        if handler:
            handler(attrs)

    def handle_endtag(self, tag):
        cur = self.current_tag.pop()
        if self.invisible:
            self.invisible.pop()
            return

        self.next_line[
            -1].padding = self.current_line[-1].padding - cur.padding
        self.current_line[-1].margin_after = max(
            self.current_line[-1].margin_after, cur.margin_after)
        # flush text after display:block elements
        if cur.display == Display.block:
            # propagate the new padding to the current line, if nothing has
            # been written
            if not self.write_line():
                self.current_line[-1].padding = self.next_line[-1].padding

        handler = self.end_tag_handler_dict.get(tag, None)
        if handler:
            handler()

    def handle_data(self, data):
        if self.invisible:
            return

        # protect pre areas
        if self.current_tag[-1].whitespace == WhiteSpace.pre:
            data = '\0' + data + '\0'

        # add prefix, if present
        data = self.current_tag[-1].prefix + data + self.current_tag[-1].suffix

        # determine whether to add this content to a table column
        # or to a standard line
        self.current_line[-1].content += data

    def start_ul(self, attrs):
        self.li_level += 1
        self.li_counter.append(Inscriptis.get_bullet(self.li_level - 1))

    def end_ul(self):
        self.li_level -= 1
        self.li_counter.pop()

    def start_img(self, attrs):
        image_text = attrs.get('alt', '') or attrs.get('title', '')
        if image_text and not (self.cfg_deduplicate_captions
                               and image_text == self.last_caption):
            self.current_line[-1].content += '[{}]'.format(image_text)
            self.last_caption = image_text

    def start_a(self, attrs):
        self.link_target = attrs.get('href', '')
        self.current_line[-1].content += '['

    def end_a(self):
        self.current_line[-1].content += ']({})'.format(self.link_target)

    def start_ol(self, attrs):
        self.li_counter.append(1)
        self.li_level += 1

    def end_ol(self):
        self.li_level -= 1
        self.li_counter.pop()

    def start_li(self, attrs):
        self.write_line()
        if self.li_level > 0:
            bullet = self.li_counter[-1]
        else:
            bullet = "* "
        if isinstance(bullet, int):
            self.li_counter[-1] += 1
            self.current_line[-1].list_bullet = "{}. ".format(bullet)
        else:
            self.current_line[-1].list_bullet = bullet

    def start_table(self, attrs):
        self.current_table.append(Table())

    def start_tr(self, attrs):
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            self.current_table[-1].add_row()

    def start_td(self, attrs):
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            # open td tag
            self.clean_text_lines.append([])
            self.current_line.append(Line())
            self.next_line.append(Line())
            self.current_table[-1].add_cell(self.clean_text_lines[-1])
            self.current_table[-1].td_is_open = True

    def end_td(self):
        if self.current_table and self.current_table[-1].td_is_open:
            self.current_table[-1].td_is_open = False
            self.write_line(force=True)
            self.clean_text_lines.pop()
            self.current_line.pop()
            self.next_line.pop()

    def end_tr(self):
        pass

    def end_table(self):
        if self.current_table and self.current_table[-1].td_is_open:
            self.end_td()
        self.write_line()
        table = self.current_table.pop()
        self.write_line_verbatim(table.get_text())

    def newline(self, attrs):
        self.write_line(force=True)

    @staticmethod
    def get_bullet(index):
        '''
        ::return:
           the bullet that corresponds to the given index
        '''
        return Inscriptis.UL_COUNTER[index % Inscriptis.UL_COUNTER_LEN]

Пример #4

Показать файл

def test_style_unit_parsing():
    html_element = CssParse.get_style_attribute(
        "margin-top:2.666666667em;margin-bottom: 2.666666667em",
        html_element=HtmlElement())
    assert html_element.margin_before == 3
    assert html_element.margin_after == 3