def __init__(self, html_tree, display_images=False,
             deduplicate_captions=False, display_links=False, css=None):
    '''
    Initializes the parser state and immediately processes the given tree.

    Arguments:
      html_tree -- the HTML tree that is passed on to `crawl_tree`.
      display_images -- if True, the `img` start tag handler is registered.
      deduplicate_captions -- stored as `cfg_deduplicate_captions`.
      display_links -- if True, the `a` start and end tag handlers are
                       registered.
      css -- the CSS profile to use; any falsy value selects
             `CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]`.
    '''
    # configuration
    self.cfg_deduplicate_captions = deduplicate_captions
    self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]

    # call table for start tags; the optional handlers are appended last
    # and stay in the table as None when the matching flag is disabled
    start_handlers = {
        'table': self.start_table,
        'tr': self.start_tr,
        'td': self.start_td,
        'th': self.start_td,
        'ul': self.start_ul,
        'ol': self.start_ol,
        'li': self.start_li,
        'br': self.newline,
    }
    start_handlers['a'] = self.start_a if display_links else None
    start_handlers['img'] = self.start_img if display_images else None
    self.start_tag_handler_dict = start_handlers

    # call table for end tags
    end_handlers = {
        'table': self.end_table,
        'ul': self.end_ul,
        'ol': self.end_ol,
        'td': self.end_td,
        'th': self.end_td,
    }
    end_handlers['a'] = self.end_a if display_links else None
    self.end_tag_handler_dict = end_handlers

    # stacks describing the current tag and the line that is being built
    self.current_tag = [HtmlElement()]
    self.current_line = [Line()]
    self.next_line = [Line()]

    # the canvases used for displaying text
    # clean_text_lines[0] refers to the root canvas; tables write into child
    # canvases that are created for every table line and merged with the
    # root canvas at the end of a table
    self.clean_text_lines = [[]]

    # table and list bookkeeping
    self.current_table = []
    self.li_counter = []
    self.li_level = 0

    # attributes that are considered invisible
    self.invisible = []
    self.last_caption = None

    # used if display_links is enabled
    self.link_target = ''

    # crawl the html tree and flush a pending, non-empty last line
    self.crawl_tree(html_tree)
    if self.current_line[-1]:
        self.write_line()
def start_td(self, attrs):
    '''
    Opens a new cell in the innermost table.

    Nothing happens if no table is currently open. A cell that is still
    marked as open is closed via `end_td` before the new one is started.

    Arguments:
      attrs -- the tag's attributes (not used by this handler).
    '''
    if not self.current_table:
        return

    # clean up a <td> tag that has not been closed yet
    if self.current_table[-1].td_is_open:
        self.end_td()

    # every cell writes into its own canvas and line buffers
    cell_canvas = []
    self.clean_text_lines.append(cell_canvas)
    self.current_line.append(Line())
    self.next_line.append(Line())

    # register the canvas with the table and mark the cell as open
    table = self.current_table[-1]
    table.add_cell(cell_canvas)
    table.td_is_open = True
def __init__(self, html_tree, config=None):
    '''
    Initializes the parser state and immediately processes the given tree.

    Arguments:
      html_tree -- the HTML tree that is passed on to `_parse_html_tree`.
      config -- the configuration object to use; a falsy value selects a
                freshly created `ParserConfig()`.
    '''
    # use the default configuration, if no config object is provided
    self.config = config if config else ParserConfig()

    # call table for start tags; the optional handlers are appended last
    # and stay in the table as None when the configuration disables them
    start_handlers = {
        'table': self._start_table,
        'tr': self._start_tr,
        'td': self._start_td,
        'th': self._start_td,
        'ul': self._start_ul,
        'ol': self._start_ol,
        'li': self._start_li,
        'br': self._newline,
    }
    start_handlers['a'] = \
        self._start_a if self.config.parse_a() else None
    start_handlers['img'] = \
        self._start_img if self.config.display_images else None
    self.start_tag_handler_dict = start_handlers

    # call table for end tags
    end_handlers = {
        'table': self._end_table,
        'ul': self._end_ul,
        'ol': self._end_ol,
        'td': self._end_td,
        'th': self._end_td,
    }
    end_handlers['a'] = self._end_a if self.config.parse_a() else None
    self.end_tag_handler_dict = end_handlers

    # the tag stack starts with the 'body' entry of the configured css
    self.current_tag = [self.config.css['body']]
    self.current_line = [Line()]
    self.next_line = [Line()]

    # the canvases used for displaying text
    # clean_text_lines[0] refers to the root canvas; tables write into child
    # canvases that are created for every table line and merged with the
    # root canvas at the end of a table
    self.clean_text_lines = [[]]

    # table and list bookkeeping
    self.current_table = []
    self.li_counter = []
    self.li_level = 0
    self.last_caption = None

    # used if display_links is enabled
    self.link_target = ''

    # crawl the html tree and flush a pending, non-empty last line
    self._parse_html_tree(html_tree)
    if self.current_line[-1]:
        self._write_line()
def test_cell_formatting():
    '''
    Checks `Line.get_text()` and `str(Line)` while margins, a list bullet,
    padding, a prefix and a suffix are added step by step.
    '''
    # standard line: no margins, no decoration, no padding
    line = Line()
    initial_state = (
        ('margin_before', 0),
        ('margin_after', 0),
        ('prefix', ''),
        ('suffix', ''),
        ('content', 'Ehre sei Gott!'),
        ('list_bullet', ''),
        ('padding', 0),
    )
    for attribute, value in initial_state:
        setattr(line, attribute, value)
    assert line.get_text() == 'Ehre sei Gott!'

    # string representation
    assert str(line) == "<Line: 'Ehre sei Gott!'>"

    # margins: one line before, two lines after
    line.margin_before, line.margin_after = 1, 2
    assert line.get_text() == '\nEhre sei Gott!\n\n'

    # list bullet without padding
    line.list_bullet = "* "
    assert line.get_text() == '\n* Ehre sei Gott!\n\n'

    # list bullet with padding
    line.padding = 3
    assert line.get_text() == '\n * Ehre sei Gott!\n\n'

    # prefix and suffix enclose the content
    line.prefix, line.suffix = '>>', '<<'
    assert line.get_text() == '\n * >>Ehre sei Gott!<<\n\n'
def write_line(self, force=False):
    '''
    Flushes the current line to the active canvas, provided that it holds
    relevant content or that writing is forced.

    If the line is skipped, its `margin_before` is raised to at least the
    `margin_before` of the current tag.

    Arguments:
      force -- write the line even if its content is empty or consists of
               whitespace only.

    Returns:
      bool -- True, if a line has been written, otherwise False.
    '''
    pending = self.current_line[-1]

    if not force:
        text = pending.content
        # only break the line if there is any relevant content
        if not text or text.isspace():
            required_margin = self.current_tag[-1].margin_before
            pending.margin_before = max(pending.margin_before,
                                        required_margin)
            return False

    # emit the line and promote the prepared next line to the current one
    self.clean_text_lines[-1].append(pending.get_text())
    self.current_line[-1] = self.next_line[-1]
    self.next_line[-1] = Line()
    return True