def __init__(self, html_tree, display_images=False,
             deduplicate_captions=False, display_links=False, css=None):
    '''
    Initializes the converter state and crawls the given HTML tree.

    Args:
      html_tree: the HTML tree that is passed to `crawl_tree`.
      display_images: enables the start tag handler for `img` tags.
      deduplicate_captions: stored as `cfg_deduplicate_captions`.
      display_links: enables the start and end tag handlers for `a` tags.
      css: the CSS definition to use; the default CSS profile is used
           if no (or an empty) definition is provided.
    '''
    # configuration
    self.cfg_deduplicate_captions = deduplicate_captions
    self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]

    # optional handlers remain registered as `None` if they are disabled
    link_start = self.start_a if display_links else None
    link_end = self.end_a if display_links else None
    image_start = self.start_img if display_images else None

    # call tables for start and end tags
    self.start_tag_handler_dict = dict(
        table=self.start_table,
        tr=self.start_tr,
        td=self.start_td,
        th=self.start_td,
        ul=self.start_ul,
        ol=self.start_ol,
        li=self.start_li,
        br=self.newline,
        a=link_start,
        img=image_start,
    )
    self.end_tag_handler_dict = dict(
        table=self.end_table,
        ul=self.end_ul,
        ol=self.end_ol,
        td=self.end_td,
        th=self.end_td,
        a=link_end,
    )

    # parser state; every list is used as a stack
    self.current_tag = [HtmlElement()]
    self.current_line = [Line()]
    self.next_line = [Line()]

    # text canvases: clean_text_lines[0] is the root canvas, table cells
    # push their own child canvas on top of it
    self.clean_text_lines = [[]]

    self.current_table = []
    self.li_counter = []
    self.li_level = 0
    # elements that are currently considered invisible
    self.invisible = []
    self.last_caption = None

    # only relevant if display_links is enabled
    self.link_target = ''

    # convert the tree and flush whatever is left in the current line
    self.crawl_tree(html_tree)
    if self.current_line[-1]:
        self.write_line()
def test_html_element_str():
    '''
    Verifies the text returned by `str()` for an HtmlElement.
    '''
    element = HtmlElement('div', '', '', Display.inline, 0, 0, 0,
                          WhiteSpace.pre)
    expected = ('<div prefix=, suffix=, '
                'display=Display.inline, margin_before=0, '
                'margin_after=0, padding=0, '
                'whitespace=WhiteSpace.pre>')
    assert str(element) == expected
def test_style_unit_parsing():
    '''
    Verifies that both `em` margins of the style attribute are parsed as 3.
    '''
    style = "margin-top:2.666666667em;margin-bottom: 2.666666667em"
    parsed = CssParse.get_style_attribute(style, html_element=HtmlElement())
    assert parsed.margin_before == 3
    assert parsed.margin_after == 3
class Inscriptis(object):
    '''
    The Inscriptis class translates an lxml HTML tree to the corresponding
    text representation.

    Args:
      html_tree: the lxml HTML tree to convert.
      display_images: whether to include image titles/alt texts.
      deduplicate_captions: whether to deduplicate captions such as image
                            titles (many newspapers include images and video
                            previews with identical titles).
      display_links: whether to display link targets
                     (e.g. `[Python](https://www.python.org)`).
      css: an optional custom CSS definition.

    Example::

      from lxml.html import fromstring
      from inscriptis.html_engine import Inscriptis

      html_content = "<html><body><h1>Test</h1></body></html>"

      # create an HTML tree from the HTML content.
      html_tree = fromstring(html_content)

      # transform the HTML tree to text.
      parser = Inscriptis(html_tree)
      text = parser.get_text()
    '''

    # bullets used for unordered lists; selected by nesting level
    UL_COUNTER = ('* ', '+ ', 'o ', '- ')
    UL_COUNTER_LEN = len(UL_COUNTER)
    # fallback for tags that are not covered by the CSS definition
    DEFAULT_ELEMENT = HtmlElement()

    def __init__(self, html_tree, display_images=False,
                 deduplicate_captions=False, display_links=False, css=None):
        # setup config
        self.cfg_deduplicate_captions = deduplicate_captions
        self.css = css if css else CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]

        # setup start and end tag call tables; handlers of disabled
        # features are registered as `None` and, therefore, never called
        self.start_tag_handler_dict = {
            'table': self.start_table,
            'tr': self.start_tr,
            'td': self.start_td,
            'th': self.start_td,
            'ul': self.start_ul,
            'ol': self.start_ol,
            'li': self.start_li,
            'br': self.newline,
            'a': self.start_a if display_links else None,
            'img': self.start_img if display_images else None,
        }
        self.end_tag_handler_dict = {
            'table': self.end_table,
            'ul': self.end_ul,
            'ol': self.end_ol,
            'td': self.end_td,
            'th': self.end_td,
            'a': self.end_a if display_links else None,
        }

        # instance variables
        self.current_tag = [HtmlElement()]
        self.current_line = [Line()]
        self.next_line = [Line()]

        # the canvases used for displaying text
        # clean_text_lines[0] refers to the root canvas; tables write into
        # child canvases that are created for every table cell and merged
        # with the root canvas at the end of a table
        self.clean_text_lines = [[]]

        self.current_table = []
        self.li_counter = []
        self.li_level = 0
        self.invisible = []  # elements that are considered invisible
        self.last_caption = None

        # used if display_links is enabled
        self.link_target = ''

        # crawl the html tree
        self.crawl_tree(html_tree)
        if self.current_line[-1]:
            self.write_line()

    def crawl_tree(self, tree):
        '''
        Recursively walks the given tree and dispatches its content to the
        start tag, data and end tag handlers.

        Args:
          tree: the (sub)tree to process.
        '''
        # nodes without a string tag only contribute their tail text
        if isinstance(tree.tag, str):
            self.handle_starttag(tree.tag, tree.attrib)
            if tree.text:
                self.handle_data(tree.text)

            for node in tree:
                self.crawl_tree(node)

            self.handle_endtag(tree.tag)

        if tree.tail:
            self.handle_data(tree.tail)

    def get_text(self):
        '''
        Returns:
          str -- A text representation of the parsed content.
        '''
        return unescape(
            '\n'.join(chain.from_iterable(self.clean_text_lines))).rstrip()

    def write_line(self, force=False):
        '''
        Writes the current line to the buffer, provided that there is any
        data to write.

        Args:
          force (bool): write the line even if it is empty or only contains
                        whitespace.

        Returns:
          bool -- True, if a line has been written, otherwise False.
        '''
        # only break the line if there is any relevant content
        if not force and (not self.current_line[-1].content
                          or self.current_line[-1].content.isspace()):
            self.current_line[-1].margin_before = \
                max(self.current_line[-1].margin_before,
                    self.current_tag[-1].margin_before)
            return False

        line = self.current_line[-1].get_text()
        self.clean_text_lines[-1].append(line)
        self.current_line[-1] = self.next_line[-1]
        self.next_line[-1] = Line()
        return True

    def write_line_verbatim(self, text):
        '''
        Writes the given text to the buffer without any modifications.

        Args:
          text (str): the text to write.
        '''
        self.clean_text_lines[-1].append(text)

    def handle_starttag(self, tag, attrs):
        '''
        Handles HTML start tags.

        Args:
          tag (str): the HTML start tag to process.
          attrs (dict): a dictionary of HTML attributes and their respective
                        values.
        '''
        # use the css to handle tags known to it :)
        cur = self.css.get(tag, Inscriptis.DEFAULT_ELEMENT)
        if 'style' in attrs:
            cur = CssParse.get_style_attribute(attrs['style'],
                                               html_element=cur)
        self.current_tag.append(cur)
        # everything below an invisible element is invisible as well
        if cur.display == Display.none or self.invisible:
            self.invisible.append(cur)
            return

        self.next_line[-1].padding = self.current_line[-1].padding \
            + cur.padding
        # flush text before display:block elements
        if cur.display == Display.block:
            if not self.write_line():
                self.current_line[-1].margin_before = max(
                    self.current_line[-1].margin_before, cur.margin_before)
                self.current_line[-1].padding = self.next_line[-1].padding
            else:
                self.current_line[-1].margin_after = max(
                    self.current_line[-1].margin_after, cur.margin_after)

        handler = self.start_tag_handler_dict.get(tag)
        if handler:
            handler(attrs)

    def handle_endtag(self, tag):
        '''
        Handles HTML end tags.

        Args:
          tag (str): the HTML end tag to process.
        '''
        cur = self.current_tag.pop()
        if self.invisible:
            self.invisible.pop()
            return

        self.next_line[-1].padding = self.current_line[-1].padding \
            - cur.padding
        self.current_line[-1].margin_after = max(
            self.current_line[-1].margin_after, cur.margin_after)
        # flush text after display:block elements
        if cur.display == Display.block:
            # propagate the new padding to the current line, if nothing has
            # been written
            if not self.write_line():
                self.current_line[-1].padding = self.next_line[-1].padding

        handler = self.end_tag_handler_dict.get(tag)
        if handler:
            handler()

    def handle_data(self, data):
        '''
        Handles text belonging to HTML tags.

        Args:
          data (str): The text to process.
        '''
        if self.invisible:
            return

        # protect pre areas
        if self.current_tag[-1].whitespace == WhiteSpace.pre:
            data = '\0' + data + '\0'

        # add prefix and suffix, if present
        data = self.current_tag[-1].prefix + data \
            + self.current_tag[-1].suffix

        # table cells and standard lines are both addressed through the
        # top of the current_line stack
        self.current_line[-1].content += data

    def start_ul(self, attrs):
        '''
        Opens an unordered list and selects the bullet for its level.
        '''
        self.li_level += 1
        self.li_counter.append(Inscriptis.get_bullet(self.li_level - 1))

    def end_ul(self):
        '''
        Closes the current unordered list.
        '''
        self.li_level -= 1
        self.li_counter.pop()

    def start_img(self, attrs):
        '''
        Adds the image's alt text (or its title, if there is no alt text)
        in square brackets to the current line.

        Args:
          attrs (dict): the attributes of the img tag.
        '''
        image_text = attrs.get('alt', '') or attrs.get('title', '')
        if image_text and not (self.cfg_deduplicate_captions
                               and image_text == self.last_caption):
            self.current_line[-1].content += '[{}]'.format(image_text)
            self.last_caption = image_text

    def start_a(self, attrs):
        '''
        Opens a link annotation, provided that the anchor has a target.

        Args:
          attrs (dict): the attributes of the a tag.
        '''
        self.link_target = attrs.get('href', '')
        # anchors without a target (e.g. <a name="top">) are no links and,
        # therefore, must not be rendered as `[text]()`
        if self.link_target:
            self.current_line[-1].content += '['

    def end_a(self):
        '''
        Closes the link annotation opened by `start_a` and adds the target.
        '''
        if self.link_target:
            self.current_line[-1].content += \
                ']({})'.format(self.link_target)

    def start_ol(self, attrs):
        '''
        Opens an ordered list; its counter starts with 1.
        '''
        self.li_counter.append(1)
        self.li_level += 1

    def end_ol(self):
        '''
        Closes the current ordered list.
        '''
        self.li_level -= 1
        self.li_counter.pop()

    def start_li(self, attrs):
        '''
        Starts a new list item and sets the bullet of the current line.
        '''
        self.write_line()
        # list items outside of a list get a default bullet
        if self.li_level > 0:
            bullet = self.li_counter[-1]
        else:
            bullet = "* "

        # integer bullets belong to ordered lists and are incremented
        if isinstance(bullet, int):
            self.li_counter[-1] += 1
            self.current_line[-1].list_bullet = "{}. ".format(bullet)
        else:
            self.current_line[-1].list_bullet = bullet

    def start_table(self, attrs):
        '''
        Opens a new (possibly nested) table.
        '''
        self.current_table.append(Table())

    def start_tr(self, attrs):
        '''
        Adds a new row to the current table.
        '''
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            self.current_table[-1].add_row()

    def start_td(self, attrs):
        '''
        Opens a new table cell and the canvas the cell writes to.
        '''
        if self.current_table:
            # check whether we need to cleanup a <td> tag that has not been
            # closed yet
            if self.current_table[-1].td_is_open:
                self.end_td()

            # open td tag
            self.clean_text_lines.append([])
            self.current_line.append(Line())
            self.next_line.append(Line())
            self.current_table[-1].add_cell(self.clean_text_lines[-1])
            self.current_table[-1].td_is_open = True

    def end_td(self):
        '''
        Closes the current table cell and removes its canvas.
        '''
        if self.current_table and self.current_table[-1].td_is_open:
            self.current_table[-1].td_is_open = False
            self.write_line(force=True)
            self.clean_text_lines.pop()
            self.current_line.pop()
            self.next_line.pop()

    def end_tr(self):
        '''
        Does nothing.
        '''
        pass

    def end_table(self):
        '''
        Closes the current table and writes its text to the parent canvas.
        '''
        if self.current_table and self.current_table[-1].td_is_open:
            self.end_td()
        self.write_line()
        table = self.current_table.pop()
        self.write_line_verbatim(table.get_text())

    def newline(self, attrs):
        '''
        Writes the current line, even if it is empty.
        '''
        self.write_line(force=True)

    @staticmethod
    def get_bullet(index):
        '''
        Args:
          index (int): the index of the requested bullet; indices beyond
                       the number of available bullets wrap around.

        Returns:
          str -- The bullet that corresponds to the given index.
        '''
        return Inscriptis.UL_COUNTER[index % Inscriptis.UL_COUNTER_LEN]
# coding: utf-8 ''' Standard CSS profiles shipped with inscriptis. - `strict`: this profile corresponds to the defaults used by Firefox - `relaxed`: this profile is more suited for text analytics, since it ensures that whitespaces are inserted between span and div elements preventing cases where two words stick together. ''' from inscriptis.model.css import HtmlElement from inscriptis.html_properties import Display, WhiteSpace STRICT_CSS_PROFILE = { 'body': HtmlElement('body', display=Display.inline, whitespace=WhiteSpace.normal), 'head': HtmlElement('head', display=Display.none), 'link': HtmlElement('link', display=Display.none), 'meta': HtmlElement('meta', display=Display.none), 'script': HtmlElement('script', display=Display.none), 'title': HtmlElement('title', display=Display.none), 'style': HtmlElement('style', display=Display.none), 'p': HtmlElement('p', display=Display.block, margin_before=1, margin_after=1), 'figure':
# coding: utf-8 ''' Standard CSS profiles shipped with inscriptis. - `strict`: this profile corresponds to the defaults used by Firefox - `relaxed`: this profile is more suited for text analytics, since it ensures that whitespaces are inserted between span and div elements preventing cases where two words stick together. ''' from inscriptis.model.css import HtmlElement from inscriptis.html_properties import Display STRICT_CSS_PROFILE = { 'head': HtmlElement('head', display=Display.none), 'link': HtmlElement('link', display=Display.none), 'meta': HtmlElement('meta', display=Display.none), 'script': HtmlElement('script', display=Display.none), 'title': HtmlElement('title', display=Display.none), 'style': HtmlElement('style', display=Display.none), 'p': HtmlElement('p', display=Display.block, margin_before=1, margin_after=1), 'figure': HtmlElement('figure', display=Display.block,
def test_html_element_refinement():
    '''
    Verifies the prefix and suffix returned by `get_refined_html_element`
    for a `pre` and a `code` element that are refined with a `span`.
    '''
    span = HtmlElement('span', display=Display.inline, prefix=' ',
                       suffix=' ', limit_whitespace_affixes=True)
    pre = HtmlElement('pre', display=Display.block,
                      whitespace=WhiteSpace.pre)
    code = HtmlElement('code')

    # whitespace affixes: expected to be empty for pre and kept for code
    for base, affix in ((pre, ''), (code, ' ')):
        result = base.get_refined_html_element(span)
        assert result.prefix == affix
        assert result.suffix == affix

    # non-whitespace affixes: expected to be kept for pre and code
    span.prefix = ' 1. '
    span.suffix = '<'
    for base in (pre, code):
        result = base.get_refined_html_element(span)
        assert result.prefix == ' 1. '
        assert result.suffix == '<'