def close(self):
    """Finish parsing and return the collected output as a single string.

    Closes the underlying HTMLParser, flushes pending output through
    ``self.pbr()`` and ``self.o('', 0, 'end')``, joins ``self.outtextlist``
    and replaces every ' _place_holder;' marker with the character for
    ``name2cp('nbsp')`` (when ``self.unicode_snob`` is set) or with
    character 32 otherwise.  ``self.outtextlist`` is emptied before
    returning.
    """
    HTMLParser.HTMLParser.close(self)

    # unicode/unichr exist on Python 2 only; fall back to str/chr when the
    # names are missing.
    try:
        to_text, to_char = unicode, unichr
    except NameError:
        to_text, to_char = str, chr

    self.pbr()
    self.o('', 0, 'end')

    joined = to_text('').join(self.outtextlist)
    codepoint = name2cp('nbsp') if self.unicode_snob else 32
    joined = joined.replace(to_text(' _place_holder;'), to_char(codepoint))

    # Clear self.outtextlist to avoid memory leak of its content to
    # the next handling.
    self.outtextlist = []

    return joined
def close(self):
    """Close the parser and hand back everything written so far.

    After closing the base HTMLParser the method flushes output with
    ``self.pbr()`` and ``self.o("", 0, "end")``, concatenates
    ``self.outtextlist`` and substitutes the " _place_holder;" marker:
    with the ``name2cp("nbsp")`` character if ``self.unicode_snob`` is
    truthy, with character 32 if not.  The list is reset afterwards.
    """
    HTMLParser.HTMLParser.close(self)

    # Pick the text constructor and character factory once: the Python 2
    # names are tried first, NameError means we are on Python 3.
    try:
        as_text = unicode
        as_char = unichr
    except NameError:
        as_text = str
        as_char = chr

    self.pbr()
    self.o("", 0, "end")

    text = as_text("").join(self.outtextlist)

    if self.unicode_snob:
        replacement = as_char(name2cp("nbsp"))
    else:
        replacement = as_char(32)
    text = text.replace(as_text(" _place_holder;"), replacement)

    # Clear self.outtextlist to avoid memory leak of its content to
    # the next handling.
    self.outtextlist = []

    return text
def entityref(self, c):
    """Return the replacement text for the named entity ``c``.

    * When ``self.unicode_snob`` is false and ``c`` is a key of
      ``config.UNIFIABLE``, the mapped value is returned.
    * When ``name2cp(c)`` raises ``KeyError`` the entity is returned
      untouched as ``"&" + c + ";"``.
    * ``'nbsp'`` always resolves through ``config.UNIFIABLE``.
    * Anything else becomes ``chr(name2cp(c))``.
    """
    # Test membership on the mapping itself: going through ``.keys()``
    # builds a list on Python 2 and turns the lookup into a linear scan.
    if not self.unicode_snob and c in config.UNIFIABLE:
        return config.UNIFIABLE[c]

    # Resolve the code point once instead of calling name2cp() twice.
    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c + ';'

    if c == 'nbsp':
        return config.UNIFIABLE[c]
    return chr(codepoint)
def entityref(self, c):
    """Translate the entity name ``c`` into output text.

    Order of checks: a ``config.UNIFIABLE`` hit (only when
    ``self.unicode_snob`` is off), then an unknown name (``name2cp``
    raises ``KeyError``) which is echoed back as ``"&" + c + ";"``, then
    the special case ``"nbsp"`` which is taken from ``config.UNIFIABLE``,
    and finally ``chr()`` of the code point.
    """
    prefer_ascii = not self.unicode_snob
    if prefer_ascii and c in config.UNIFIABLE:
        return config.UNIFIABLE[c]

    try:
        codepoint = name2cp(c)
    except KeyError:
        # Unknown entity name: emit it unchanged.
        return "&" + c + ";"

    if c == "nbsp":
        return config.UNIFIABLE[c]
    return chr(codepoint)
def close(self):
    """Close the parser, flush pending output and return the full text.

    The joined content of ``self.outtextlist`` has its ' _place_holder;'
    markers replaced by the ``name2cp('nbsp')`` character when
    ``self.unicode_snob`` is set, otherwise by character 32.
    ``self.outtextlist`` is reset to an empty list before returning.
    """
    HTMLParser.HTMLParser.close(self)

    # Resolve everything that differs between Python 2 and 3 in one place;
    # NameError signals that unicode/unichr do not exist.
    try:
        empty = unicode('')
        placeholder = unicode(' _place_holder;')
        make_char = unichr
    except NameError:
        empty = str('')
        placeholder = ' _place_holder;'
        make_char = chr

    self.pbr()
    self.o('', 0, 'end')

    result = empty.join(self.outtextlist)
    nbsp = make_char(name2cp('nbsp') if self.unicode_snob else 32)
    result = result.replace(placeholder, nbsp)

    # Clear self.outtextlist to avoid memory leak of its content to
    # the next handling.
    self.outtextlist = []

    return result
def close(self):
    """Close the parser and return the accumulated output.

    Flushes with ``self.pbr()`` and ``self.o("", force="end")``, joins
    ``self.outtextlist`` and swaps each " _place_holder;" marker for the
    ``name2cp("nbsp")`` character (``self.unicode_snob`` set) or for
    character 32.  ``self.outtextlist`` is emptied afterwards.
    """
    HTMLParser.HTMLParser.close(self)

    self.pbr()
    self.o("", force="end")

    # NOTE(review): nochr is not defined in this method; presumably a
    # module-level empty string -- confirm against the rest of the file.
    joined = nochr.join(self.outtextlist)

    codepoint = name2cp("nbsp") if self.unicode_snob else 32
    joined = joined.replace(" _place_holder;", chr(codepoint))

    # Clear self.outtextlist to avoid memory leak of its content to
    # the next handling.
    self.outtextlist = []

    return joined
def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH):
    """
    Initialise the parser: copy option defaults from ``config`` and reset
    all per-document state.

    Input parameters:
        out: possible custom replacement for self.outtextf
             (which appends lines of text).
        baseurl: base URL of the document we process
        bodywidth: value stored as self.body_width
    """
    # convert_charrefs is only handed to HTMLParser on Python >= 3.4.
    parser_kwargs = (
        {"convert_charrefs": False} if sys.version_info >= (3, 4) else {}
    )
    HTMLParser.HTMLParser.__init__(self, **parser_kwargs)

    # --- table handling state ---
    self.split_next_td = False
    self.td_count = 0
    self.table_start = False

    # --- config options: covered in cli ---
    self.unicode_snob = config.UNICODE_SNOB
    self.escape_snob = config.ESCAPE_SNOB
    self.body_width = bodywidth
    self.skip_internal_links = config.SKIP_INTERNAL_LINKS
    self.inline_links = config.INLINE_LINKS
    self.protect_links = config.PROTECT_LINKS
    self.google_list_indent = config.GOOGLE_LIST_INDENT
    self.ignore_links = config.IGNORE_ANCHORS
    self.ignore_images = config.IGNORE_IMAGES
    self.images_to_alt = config.IMAGES_TO_ALT
    self.images_with_size = config.IMAGES_WITH_SIZE
    self.ignore_emphasis = config.IGNORE_EMPHASIS
    self.bypass_tables = config.BYPASS_TABLES
    self.google_doc = False
    self.ul_item_mark = "*"
    self.emphasis_mark = "_"
    self.single_line_break = config.SINGLE_LINE_BREAK
    self.use_automatic_links = config.USE_AUTOMATIC_LINKS
    self.hide_strikethrough = False
    self.wrap_links = config.WRAP_LINKS

    # --- remaining config options ---
    self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
    self.strong_mark = "**"
    self.mark_code = config.MARK_CODE
    self.tag_callback = None

    # Output sink: the caller's callable when given, self.outtextf otherwise.
    self.out = self.outtextf if out is None else out  # pragma: no cover

    # empty list to store output characters before they are "joined"
    self.outtextlist = []

    # --- output bookkeeping ---
    self.quiet = 0
    self.p_p = 0  # number of newline character to print before next output
    self.outcount = 0
    self.start = 1
    self.space = 0

    # --- link tracking ---
    self.a = []
    self.astack = []
    self.maybe_automatic_link = None
    self.empty_link = False
    self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
    self.acount = 0

    # --- block / inline state ---
    self.list = []
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0
    self.code = False
    self.br_toggle = ""
    self.lastWasNL = 0
    self.lastWasList = False
    self.style = 0
    self.style_def = {}
    self.tag_stack = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False

    # --- abbreviations ---
    self.abbr_title = None  # current abbreviation definition
    self.abbr_data = None  # last inner HTML (for abbr being defined)
    self.abbr_list = {}  # stack of abbreviations to write later

    self.baseurl = baseurl

    # Drop the nbsp code point from unifiable_n (a missing key is fine)
    # and map the 'nbsp' entity to the placeholder marker instead.
    try:
        del unifiable_n[name2cp("nbsp")]
    except KeyError:
        pass
    config.UNIFIABLE["nbsp"] = " _place_holder;"
def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
    """
    Initialise the parser: copy option defaults from ``config`` and reset
    all per-document state.

    Input parameters:
        out: possible custom replacement for self.outtextf
             (which appends lines of text).
        baseurl: base URL of the document we process
        bodywidth: value stored as self.body_width
    """
    # convert_charrefs is only handed to HTMLParser on Python >= 3.4.
    parser_kwargs = (
        {'convert_charrefs': False} if sys.version_info >= (3, 4) else {}
    )
    HTMLParser.HTMLParser.__init__(self, **parser_kwargs)

    # --- table handling state ---
    self.split_next_td = False
    self.td_count = 0
    self.table_start = False

    # --- config options: covered in cli ---
    self.unicode_snob = config.UNICODE_SNOB
    self.escape_snob = config.ESCAPE_SNOB
    self.body_width = bodywidth
    self.skip_internal_links = config.SKIP_INTERNAL_LINKS
    self.inline_links = config.INLINE_LINKS
    self.protect_links = config.PROTECT_LINKS
    self.google_list_indent = config.GOOGLE_LIST_INDENT
    self.ignore_links = config.IGNORE_ANCHORS
    self.ignore_images = config.IGNORE_IMAGES
    self.images_to_alt = config.IMAGES_TO_ALT
    self.images_with_size = config.IMAGES_WITH_SIZE
    self.ignore_emphasis = config.IGNORE_EMPHASIS
    self.bypass_tables = config.BYPASS_TABLES
    self.ignore_tables = config.IGNORE_TABLES
    self.google_doc = False
    self.ul_item_mark = '*'
    self.emphasis_mark = '_'
    self.single_line_break = config.SINGLE_LINE_BREAK
    self.use_automatic_links = config.USE_AUTOMATIC_LINKS
    self.hide_strikethrough = False
    self.wrap_links = config.WRAP_LINKS
    self.pad_tables = config.PAD_TABLES
    self.default_image_alt = config.DEFAULT_IMAGE_ALT

    # --- remaining config options ---
    self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
    self.strong_mark = '**'
    self.mark_code = config.MARK_CODE
    self.tag_callback = None

    # Output sink: the caller's callable when given, self.outtextf otherwise.
    self.out = self.outtextf if out is None else out  # pragma: no cover

    # empty list to store output characters before they are "joined"
    self.outtextlist = []

    # --- output bookkeeping ---
    self.quiet = 0
    self.p_p = 0  # number of newline character to print before next output
    self.outcount = 0
    self.start = 1
    self.space = 0

    # --- link tracking ---
    self.a = []
    self.astack = []
    self.maybe_automatic_link = None
    self.empty_link = False
    self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
    self.acount = 0

    # --- block / inline state ---
    self.list = []
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0
    self.code = False
    self.br_toggle = ''
    self.lastWasNL = 0
    self.lastWasList = False
    self.style = 0
    self.style_def = {}
    self.tag_stack = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False

    # --- abbreviations ---
    self.abbr_title = None  # current abbreviation definition
    self.abbr_data = None  # last inner HTML (for abbr being defined)
    self.abbr_list = {}  # stack of abbreviations to write later

    self.baseurl = baseurl

    # --- emphasis / data tracking ---
    self.stressed = False
    self.preceding_stressed = False
    self.preceding_data = None
    self.current_tag = None

    # Drop the nbsp code point from unifiable_n (a missing key is fine)
    # and map the 'nbsp' entity to the placeholder marker instead.
    try:
        del unifiable_n[name2cp('nbsp')]
    except KeyError:
        pass
    config.UNIFIABLE['nbsp'] = ' _place_holder;'
def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
    """
    Initialise the parser: copy option defaults from ``config`` and reset
    all per-document state.

    Input parameters:
        out: possible custom replacement for self.outtextf
             (which appends lines of text).
        baseurl: base URL of the document we process
        bodywidth: value stored as self.body_width
    """
    # Local import: only needed for the version check below.
    import sys

    # HTMLParser accepts convert_charrefs from Python 3.4 and defaults it
    # to True from 3.5 on, in which case character/entity references are
    # converted and delivered as plain data instead of going through the
    # reference handlers.  Switch it off wherever the argument exists.
    parser_kwargs = {}
    if sys.version_info >= (3, 4):
        parser_kwargs['convert_charrefs'] = False
    HTMLParser.HTMLParser.__init__(self, **parser_kwargs)

    # Config options
    self.split_next_td = False
    self.td_count = 0
    self.table_start = False
    self.unicode_snob = config.UNICODE_SNOB
    self.escape_snob = config.ESCAPE_SNOB
    self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
    self.body_width = bodywidth
    self.skip_internal_links = config.SKIP_INTERNAL_LINKS
    self.inline_links = config.INLINE_LINKS
    self.protect_links = config.PROTECT_LINKS
    self.google_list_indent = config.GOOGLE_LIST_INDENT
    self.ignore_links = config.IGNORE_ANCHORS
    self.ignore_images = config.IGNORE_IMAGES
    self.images_to_alt = config.IMAGES_TO_ALT
    self.images_with_size = config.IMAGES_WITH_SIZE
    self.ignore_emphasis = config.IGNORE_EMPHASIS
    self.bypass_tables = config.BYPASS_TABLES
    self.google_doc = False
    self.ul_item_mark = '*'
    self.emphasis_mark = '_'
    self.strong_mark = '**'
    self.single_line_break = config.SINGLE_LINE_BREAK

    # Output sink: the caller's callable when given, self.outtextf otherwise.
    self.out = self.outtextf if out is None else out

    # empty list to store output characters before they are "joined"
    self.outtextlist = []

    self.quiet = 0
    self.p_p = 0  # number of newline character to print before next output
    self.outcount = 0
    self.start = 1
    self.space = 0
    self.a = []
    self.astack = []
    self.maybe_automatic_link = None
    self.empty_link = False
    self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
    self.acount = 0
    self.list = []
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0
    self.code = False
    self.br_toggle = ''
    self.lastWasNL = 0
    self.lastWasList = False
    self.style = 0
    self.style_def = {}
    self.tag_stack = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False
    self.abbr_title = None  # current abbreviation definition
    self.abbr_data = None  # last inner HTML (for abbr being defined)
    self.abbr_list = {}  # stack of abbreviations to write later
    self.baseurl = baseurl

    # Drop the nbsp code point from unifiable_n (a missing key is fine)
    # and map the 'nbsp' entity to the placeholder marker instead.
    try:
        del unifiable_n[name2cp('nbsp')]
    except KeyError:
        pass
    config.UNIFIABLE['nbsp'] = ' _place_holder;'
def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
    """
    Initialise the parser: copy option defaults from ``config`` and reset
    all per-document state.

    Input parameters:
        out: possible custom replacement for self.outtextf
             (which appends lines of text).
        baseurl: base URL of the document we process
        bodywidth: value stored as self.body_width
    """
    # Local import: only needed for the version check below.
    import sys

    # HTMLParser accepts convert_charrefs from Python 3.4 and defaults it
    # to True from 3.5 on, in which case character/entity references are
    # converted and delivered as plain data instead of going through the
    # reference handlers.  Switch it off wherever the argument exists.
    parser_kwargs = {}
    if sys.version_info >= (3, 4):
        parser_kwargs['convert_charrefs'] = False
    HTMLParser.HTMLParser.__init__(self, **parser_kwargs)

    # Config options
    self.split_next_td = False
    self.td_count = 0
    self.table_start = False
    self.unicode_snob = config.UNICODE_SNOB
    self.escape_snob = config.ESCAPE_SNOB
    self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
    self.body_width = bodywidth
    self.skip_internal_links = config.SKIP_INTERNAL_LINKS
    self.inline_links = config.INLINE_LINKS
    self.protect_links = config.PROTECT_LINKS
    self.google_list_indent = config.GOOGLE_LIST_INDENT
    self.ignore_links = config.IGNORE_ANCHORS
    self.ignore_images = config.IGNORE_IMAGES
    self.images_to_alt = config.IMAGES_TO_ALT
    self.ignore_emphasis = config.IGNORE_EMPHASIS
    self.bypass_tables = config.BYPASS_TABLES
    self.google_doc = False
    self.ul_item_mark = '*'
    self.emphasis_mark = '_'
    self.strong_mark = '**'
    self.single_line_break = config.SINGLE_LINE_BREAK

    # Output sink: the caller's callable when given, self.outtextf otherwise.
    self.out = self.outtextf if out is None else out

    # empty list to store output characters before they are "joined"
    self.outtextlist = []

    self.quiet = 0
    self.p_p = 0  # number of newline character to print before next output
    self.outcount = 0
    self.start = 1
    self.space = 0
    self.a = []
    self.astack = []
    self.maybe_automatic_link = None
    self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
    self.acount = 0
    self.list = []
    self.blockquote = 0
    self.pre = 0
    self.startpre = 0
    self.code = False
    self.br_toggle = ''
    self.lastWasNL = 0
    self.lastWasList = False
    self.style = 0
    self.style_def = {}
    self.tag_stack = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False
    self.abbr_title = None  # current abbreviation definition
    self.abbr_data = None  # last inner HTML (for abbr being defined)
    self.abbr_list = {}  # stack of abbreviations to write later
    self.baseurl = baseurl

    # Drop the nbsp code point from unifiable_n (a missing key is fine)
    # and map the 'nbsp' entity to the placeholder marker instead.
    try:
        del unifiable_n[name2cp('nbsp')]
    except KeyError:
        pass
    config.UNIFIABLE['nbsp'] = ' _place_holder;'