Python name2cp 예제들, html2text.utils.name2cp Python 예제들

예제 #1

0

파일 보기

파일: __init__.py 프로젝트: smblackburn/html2text

    def close(self):
        """
        Finish parsing and return the accumulated output text.

        Closes the underlying HTMLParser, calls self.pbr() and
        self.o('', 0, 'end'), joins self.outtextlist into one string and
        replaces every '&nbsp_place_holder;' marker with the character for
        name2cp('nbsp') when self.unicode_snob is set, or with a plain
        space (code point 32) otherwise.  self.outtextlist is reset to an
        empty list before returning.
        """
        HTMLParser.HTMLParser.close(self)

        # Python 2 provides ``unicode``; on Python 3 the name is undefined
        # and ``str`` is used instead.
        try:
            empty = unicode('')
        except NameError:
            empty = str('')

        self.pbr()
        self.o('', 0, 'end')

        joined = empty.join(self.outtextlist)

        # Pick the code point first, then build the character with
        # whichever of unichr/chr exists on this interpreter.
        codepoint = name2cp('nbsp') if self.unicode_snob else 32
        try:
            nbsp = unichr(codepoint)
        except NameError:
            nbsp = chr(codepoint)

        try:
            placeholder = unicode('&nbsp_place_holder;')
        except NameError:
            placeholder = '&nbsp_place_holder;'
        result = joined.replace(placeholder, nbsp)

        # Drop the buffered chunks so they do not leak into the next
        # document handled by this instance.
        self.outtextlist = []

        return result

예제 #2

0

파일 보기

파일: __init__.py 프로젝트: PlushBeaver/FanFicFare

    def close(self):
        """
        Close the parser and hand back everything written so far.

        After closing the base HTMLParser this calls self.pbr() and
        self.o("", 0, "end"), concatenates self.outtextlist, and swaps the
        "&nbsp_place_holder;" marker for either the name2cp("nbsp")
        character (self.unicode_snob set) or a regular space.  The output
        buffer is emptied before the text is returned.
        """
        HTMLParser.HTMLParser.close(self)

        # ``unicode`` only exists on Python 2; fall back to ``str``.
        try:
            separator = unicode("")
        except NameError:
            separator = str("")

        self.pbr()
        self.o("", 0, "end")

        text = separator.join(self.outtextlist)

        # Code point 32 is an ordinary space.
        nbsp_codepoint = 32
        if self.unicode_snob:
            nbsp_codepoint = name2cp("nbsp")
        try:
            replacement = unichr(nbsp_codepoint)
        except NameError:
            replacement = chr(nbsp_codepoint)

        try:
            marker = unicode("&nbsp_place_holder;")
        except NameError:
            marker = "&nbsp_place_holder;"
        text = text.replace(marker, replacement)

        # Reset the buffer so its content is not carried over to the
        # next handling.
        self.outtextlist = []

        return text

예제 #3

0

파일 보기

파일: __init__.py 프로젝트: murrayk-bishops/book

 def entityref(self, c):
     """
     Return the replacement text for the named entity reference ``c``.

     ``c`` is the entity name without the surrounding ``&`` and ``;``.

     * When self.unicode_snob is false and ``c`` is a key of
       config.UNIFIABLE, the mapped value is returned.
     * When name2cp(c) raises KeyError, the reference is returned
       unchanged as ``"&" + c + ";"``.
     * ``nbsp`` always maps to config.UNIFIABLE['nbsp'].
     * Otherwise the character for the code point is returned.
     """
     # Membership is tested on the dict itself; no need for .keys().
     if not self.unicode_snob and c in config.UNIFIABLE:
         return config.UNIFIABLE[c]

     # Look the code point up once and reuse it -- assumes name2cp is a
     # side-effect-free lookup (TODO confirm against its definition).
     try:
         codepoint = name2cp(c)
     except KeyError:
         return "&" + c + ';'

     if c == 'nbsp':
         return config.UNIFIABLE[c]
     return chr(codepoint)

예제 #4

0

파일 보기

파일: __init__.py 프로젝트: gaulinmp/html2text

 def entityref(self, c):
     """
     Translate the named entity reference ``c`` into output text.

     ``c`` is the bare entity name (no ``&`` / ``;``).

     Returns config.UNIFIABLE[c] when self.unicode_snob is false and
     ``c`` is present there; the untouched reference ``"&" + c + ";"``
     when name2cp(c) raises KeyError; config.UNIFIABLE['nbsp'] for
     ``nbsp``; and chr() of the looked-up code point in every other
     case.
     """
     # ``in`` on the dict replaces the needless ``.keys()`` call.
     if not self.unicode_snob and c in config.UNIFIABLE:
         return config.UNIFIABLE[c]

     # A single name2cp call serves both the "is it known?" check and
     # the conversion -- presumably name2cp is a pure lookup; verify.
     try:
         cp = name2cp(c)
     except KeyError:
         return "&" + c + ';'

     if c == 'nbsp':
         return config.UNIFIABLE[c]
     return chr(cp)

예제 #5

0

파일 보기

파일: __init__.py 프로젝트: hsmett/html2text

 def entityref(self, c):
     """
     Return the replacement text for the named entity reference ``c``.

     ``c`` is the entity name without the surrounding ``&`` and ``;``.

     * When self.unicode_snob is false and ``c`` is in config.UNIFIABLE,
       the mapped value is returned.
     * When name2cp(c) raises KeyError, the reference is returned
       unchanged as ``"&" + c + ";"``.
     * ``nbsp`` always maps to config.UNIFIABLE["nbsp"].
     * Otherwise the character for the code point is returned.
     """
     if not self.unicode_snob and c in config.UNIFIABLE:
         return config.UNIFIABLE[c]

     # Resolve the code point once instead of calling name2cp twice --
     # assumes name2cp is a side-effect-free lookup (TODO confirm).
     try:
         codepoint = name2cp(c)
     except KeyError:
         return "&" + c + ";"

     if c == "nbsp":
         return config.UNIFIABLE[c]
     return chr(codepoint)

예제 #6

0

파일 보기

파일: __init__.py 프로젝트: Alir3z4/html2text

 def entityref(self, c):
     """
     Translate the named entity reference ``c`` into output text.

     ``c`` is the bare entity name (no ``&`` / ``;``).

     Returns config.UNIFIABLE[c] when self.unicode_snob is false and
     ``c`` is present there; the untouched reference ``"&" + c + ";"``
     when name2cp(c) raises KeyError; config.UNIFIABLE["nbsp"] for
     ``nbsp``; and chr() of the looked-up code point in every other
     case.
     """
     if not self.unicode_snob and c in config.UNIFIABLE:
         return config.UNIFIABLE[c]

     # One name2cp call covers both the existence check and the
     # conversion -- presumably name2cp is a pure lookup; verify.
     try:
         cp = name2cp(c)
     except KeyError:
         return "&" + c + ";"

     if c == "nbsp":
         return config.UNIFIABLE[c]
     return chr(cp)

예제 #7

0

파일 보기

파일: __init__.py 프로젝트: Wysie/html2text

    def close(self):
        """
        Finish parsing and return the converted text.

        Closes the base HTMLParser, calls self.pbr() and
        self.o('', 0, 'end'), joins self.outtextlist and replaces the
        '&nbsp_place_holder;' marker with the name2cp('nbsp') character
        when self.unicode_snob is set, or with a space (code point 32)
        otherwise.  self.outtextlist is emptied before returning.
        """
        HTMLParser.HTMLParser.close(self)

        # Python 2 exposes unicode/unichr; Python 3 raises NameError and
        # the str/chr equivalents are used.
        try:
            empty = unicode('')
            to_char = unichr
        except NameError:
            empty = str('')
            to_char = chr

        self.pbr()
        self.o('', 0, 'end')

        joined = empty.join(self.outtextlist)

        nbsp = to_char(name2cp('nbsp')) if self.unicode_snob else to_char(32)

        try:
            marker = unicode('&nbsp_place_holder;')
        except NameError:
            marker = '&nbsp_place_holder;'
        result = joined.replace(marker, nbsp)

        # Empty the buffer so its content does not leak into the next
        # handling.
        self.outtextlist = []

        return result

예제 #8

0

파일 보기

파일: __init__.py 프로젝트: alexe0110/autoQA

    def close(self):
        """
        Close the parser and return the text produced so far.

        Calls self.pbr() and self.o("", force="end"), joins
        self.outtextlist with ``nochr`` (defined outside this method) and
        replaces each "&nbsp_place_holder;" marker with the
        name2cp("nbsp") character when self.unicode_snob is set, or a
        plain space otherwise.  The buffer is reset before returning.
        """
        HTMLParser.HTMLParser.close(self)

        self.pbr()
        self.o("", force="end")

        joined = nochr.join(self.outtextlist)

        # Code point 32 is an ordinary space.
        nbsp = chr(name2cp("nbsp") if self.unicode_snob else 32)
        result = joined.replace("&nbsp_place_holder;", nbsp)

        # Reset the buffer so its content is not carried into the next
        # handling.
        self.outtextlist = []

        return result

예제 #9

0

파일 보기

파일: __init__.py 프로젝트: PlushBeaver/FanFicFare

    def __init__(self, out=None, baseurl="", bodywidth=config.BODY_WIDTH):
        """
        Initialise the base parser and reset all conversion state.

        Input parameters:
            out: possible custom replacement for self.outtextf; stored
                 as self.out.
            baseurl: base URL of the document we process
            bodywidth: stored as self.body_width (defaults to
                       config.BODY_WIDTH).
        """
        # convert_charrefs is only handed to the base parser on
        # Python 3.4 and newer.
        kwargs = {"convert_charrefs": False} if sys.version_info >= (3, 4) else {}
        HTMLParser.HTMLParser.__init__(self, **kwargs)

        # --- Options, mostly seeded from the config module ---
        self.split_next_td = False
        self.td_count = 0
        self.table_start = False
        self.unicode_snob = config.UNICODE_SNOB  # covered in cli
        self.escape_snob = config.ESCAPE_SNOB  # covered in cli
        self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
        self.body_width = bodywidth  # covered in cli
        self.skip_internal_links = config.SKIP_INTERNAL_LINKS  # covered in cli
        self.inline_links = config.INLINE_LINKS  # covered in cli
        self.protect_links = config.PROTECT_LINKS  # covered in cli
        self.google_list_indent = config.GOOGLE_LIST_INDENT  # covered in cli
        self.ignore_links = config.IGNORE_ANCHORS  # covered in cli
        self.ignore_images = config.IGNORE_IMAGES  # covered in cli
        self.images_to_alt = config.IMAGES_TO_ALT  # covered in cli
        self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
        self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
        self.bypass_tables = config.BYPASS_TABLES  # covered in cli
        self.google_doc = False  # covered in cli
        self.ul_item_mark = "*"  # covered in cli
        self.emphasis_mark = "_"  # covered in cli
        self.strong_mark = "**"
        self.single_line_break = config.SINGLE_LINE_BREAK  # covered in cli
        self.use_automatic_links = config.USE_AUTOMATIC_LINKS  # covered in cli
        self.hide_strikethrough = False  # covered in cli
        self.mark_code = config.MARK_CODE
        self.wrap_links = config.WRAP_LINKS  # covered in cli
        self.tag_callback = None

        # Output sink: the caller's ``out`` if given, else self.outtextf.
        self.out = self.outtextf if out is None else out  # pragma: no cover

        # Output chunks are collected here and joined later.
        self.outtextlist = []

        # --- Per-document parsing state ---
        self.quiet = 0
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.maybe_automatic_link = None
        self.empty_link = False
        self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ""
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0
        self.style_def = {}
        self.tag_stack = []
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl

        # Remove the nbsp code point from unifiable_n (if present) and map
        # the nbsp entity to a placeholder string instead.
        try:
            del unifiable_n[name2cp("nbsp")]
        except KeyError:
            pass
        config.UNIFIABLE["nbsp"] = "&nbsp_place_holder;"

예제 #10

0

파일 보기

파일: __init__.py 프로젝트: murrayk-bishops/book

    def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
        """
        Initialise the base parser and reset all conversion state.

        Input parameters:
            out: possible custom replacement for self.outtextf; stored
                 as self.out.
            baseurl: base URL of the document we process
            bodywidth: stored as self.body_width (defaults to
                       config.BODY_WIDTH).
        """
        # convert_charrefs is only handed to the base parser on
        # Python 3.4 and newer.
        kwargs = {'convert_charrefs': False} if sys.version_info >= (3, 4) else {}
        HTMLParser.HTMLParser.__init__(self, **kwargs)

        # --- Options, mostly seeded from the config module ---
        self.split_next_td = False
        self.td_count = 0
        self.table_start = False
        self.unicode_snob = config.UNICODE_SNOB  # covered in cli
        self.escape_snob = config.ESCAPE_SNOB  # covered in cli
        self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
        self.body_width = bodywidth  # covered in cli
        self.skip_internal_links = config.SKIP_INTERNAL_LINKS  # covered in cli
        self.inline_links = config.INLINE_LINKS  # covered in cli
        self.protect_links = config.PROTECT_LINKS  # covered in cli
        self.google_list_indent = config.GOOGLE_LIST_INDENT  # covered in cli
        self.ignore_links = config.IGNORE_ANCHORS  # covered in cli
        self.ignore_images = config.IGNORE_IMAGES  # covered in cli
        self.images_to_alt = config.IMAGES_TO_ALT  # covered in cli
        self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
        self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
        self.bypass_tables = config.BYPASS_TABLES  # covered in cli
        self.ignore_tables = config.IGNORE_TABLES  # covered in cli
        self.google_doc = False  # covered in cli
        self.ul_item_mark = '*'  # covered in cli
        self.emphasis_mark = '_'  # covered in cli
        self.strong_mark = '**'
        self.single_line_break = config.SINGLE_LINE_BREAK  # covered in cli
        self.use_automatic_links = config.USE_AUTOMATIC_LINKS  # covered in cli
        self.hide_strikethrough = False  # covered in cli
        self.mark_code = config.MARK_CODE
        self.wrap_links = config.WRAP_LINKS  # covered in cli
        self.pad_tables = config.PAD_TABLES  # covered in cli
        self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
        self.tag_callback = None

        # Output sink: the caller's ``out`` if given, else self.outtextf.
        self.out = self.outtextf if out is None else out  # pragma: no cover

        # Output chunks are collected here and joined later.
        self.outtextlist = []

        # --- Per-document parsing state ---
        self.quiet = 0
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.maybe_automatic_link = None
        self.empty_link = False
        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ''
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0
        self.style_def = {}
        self.tag_stack = []
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl
        self.stressed = False
        self.preceding_stressed = False
        self.preceding_data = None
        self.current_tag = None

        # Remove the nbsp code point from unifiable_n (if present) and map
        # the nbsp entity to a placeholder string instead.
        try:
            del unifiable_n[name2cp('nbsp')]
        except KeyError:
            pass
        config.UNIFIABLE['nbsp'] = '&nbsp_place_holder;'

예제 #11

0

파일 보기

파일: __init__.py 프로젝트: alawibaba/html2text

    def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
        """
        Initialise the base parser and reset all conversion state.

        Input parameters:
            out: possible custom replacement for self.outtextf (which
                 appends lines of text); stored as self.out.
            baseurl: base URL of the document we process
            bodywidth: stored as self.body_width (defaults to
                       config.BODY_WIDTH).
        """
        # Imported locally so this method does not depend on a
        # module-level ``import sys``.
        import sys

        # html.parser gained ``convert_charrefs`` in Python 3.4 and
        # switched its default to True in 3.5.  With True the base parser
        # converts character references itself and no longer calls
        # handle_charref / handle_entityref, so it is disabled explicitly.
        # NOTE(review): assumes this class implements those handlers
        # (not visible in this method) -- confirm.
        parser_kwargs = {}
        if sys.version_info >= (3, 4):
            parser_kwargs['convert_charrefs'] = False
        HTMLParser.HTMLParser.__init__(self, **parser_kwargs)

        # Config options
        self.split_next_td = False
        self.td_count = 0
        self.table_start = False
        self.unicode_snob = config.UNICODE_SNOB
        self.escape_snob = config.ESCAPE_SNOB
        self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
        self.body_width = bodywidth
        self.skip_internal_links = config.SKIP_INTERNAL_LINKS
        self.inline_links = config.INLINE_LINKS
        self.protect_links = config.PROTECT_LINKS
        self.google_list_indent = config.GOOGLE_LIST_INDENT
        self.ignore_links = config.IGNORE_ANCHORS
        self.ignore_images = config.IGNORE_IMAGES
        self.images_to_alt = config.IMAGES_TO_ALT
        self.images_with_size = config.IMAGES_WITH_SIZE
        self.ignore_emphasis = config.IGNORE_EMPHASIS
        self.bypass_tables = config.BYPASS_TABLES
        self.google_doc = False
        self.ul_item_mark = '*'
        self.emphasis_mark = '_'
        self.strong_mark = '**'
        self.single_line_break = config.SINGLE_LINE_BREAK

        # Output sink: the caller's ``out`` if given, else self.outtextf.
        self.out = self.outtextf if out is None else out

        # empty list to store output characters before they are "joined"
        self.outtextlist = []

        # Per-document parsing state
        self.quiet = 0
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.maybe_automatic_link = None
        self.empty_link = False
        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ''
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0
        self.style_def = {}
        self.tag_stack = []
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl

        # Remove the nbsp code point from unifiable_n (if present) and map
        # the nbsp entity to a placeholder string instead.
        try:
            del unifiable_n[name2cp('nbsp')]
        except KeyError:
            pass
        config.UNIFIABLE['nbsp'] = '&nbsp_place_holder;'

예제 #12

0

파일 보기

파일: __init__.py 프로젝트: lip365/ebagu0.2

    def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH):
        """
        Initialise the base parser and reset all conversion state.

        Input parameters:
            out: possible custom replacement for self.outtextf (which
                 appends lines of text); stored as self.out.
            baseurl: base URL of the document we process
            bodywidth: stored as self.body_width (defaults to
                       config.BODY_WIDTH).
        """
        # Imported locally so this method does not depend on a
        # module-level ``import sys``.
        import sys

        # html.parser gained ``convert_charrefs`` in Python 3.4 and
        # switched its default to True in 3.5.  With True the base parser
        # converts character references itself and no longer calls
        # handle_charref / handle_entityref, so it is disabled explicitly.
        # NOTE(review): assumes this class implements those handlers
        # (not visible in this method) -- confirm.
        parser_kwargs = {}
        if sys.version_info >= (3, 4):
            parser_kwargs['convert_charrefs'] = False
        HTMLParser.HTMLParser.__init__(self, **parser_kwargs)

        # Config options
        self.split_next_td = False
        self.td_count = 0
        self.table_start = False
        self.unicode_snob = config.UNICODE_SNOB
        self.escape_snob = config.ESCAPE_SNOB
        self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
        self.body_width = bodywidth
        self.skip_internal_links = config.SKIP_INTERNAL_LINKS
        self.inline_links = config.INLINE_LINKS
        self.protect_links = config.PROTECT_LINKS
        self.google_list_indent = config.GOOGLE_LIST_INDENT
        self.ignore_links = config.IGNORE_ANCHORS
        self.ignore_images = config.IGNORE_IMAGES
        self.images_to_alt = config.IMAGES_TO_ALT
        self.ignore_emphasis = config.IGNORE_EMPHASIS
        self.bypass_tables = config.BYPASS_TABLES
        self.google_doc = False
        self.ul_item_mark = '*'
        self.emphasis_mark = '_'
        self.strong_mark = '**'
        self.single_line_break = config.SINGLE_LINE_BREAK

        # Output sink: the caller's ``out`` if given, else self.outtextf.
        self.out = self.outtextf if out is None else out

        # empty list to store output characters before they are "joined"
        self.outtextlist = []

        # Per-document parsing state
        self.quiet = 0
        self.p_p = 0  # number of newline character to print before next output
        self.outcount = 0
        self.start = 1
        self.space = 0
        self.a = []
        self.astack = []
        self.maybe_automatic_link = None
        self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://')
        self.acount = 0
        self.list = []
        self.blockquote = 0
        self.pre = 0
        self.startpre = 0
        self.code = False
        self.br_toggle = ''
        self.lastWasNL = 0
        self.lastWasList = False
        self.style = 0
        self.style_def = {}
        self.tag_stack = []
        self.emphasis = 0
        self.drop_white_space = 0
        self.inheader = False
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl

        # Remove the nbsp code point from unifiable_n (if present) and map
        # the nbsp entity to a placeholder string instead.
        try:
            del unifiable_n[name2cp('nbsp')]
        except KeyError:
            pass
        config.UNIFIABLE['nbsp'] = '&nbsp_place_holder;'