Python html2unicode 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: wikipedia

메소드/함수: html2unicode

hotexamples.com에서의 예제들: 7

Python html2unicode - 7개의 예제가 발견되었습니다. 아래는 오픈소스 프로젝트에서 추출한 Python wikipedia.html2unicode의 실제 사용 예제 중 가장 높은 평가를 받은 것들입니다. 각 예제를 평가하면 예제 품질 향상에 도움이 됩니다.

예제 #1

파일 보기

파일: cosmetic_changes.py 프로젝트: pyropeter/PyroBot-1G

 def resolveHtmlEntities(self, text):
     """Run *text* through ``wikipedia.html2unicode`` and return the result.

     The code points listed below are handed over as the ``ignore``
     argument; presumably entities for these characters are left
     untouched (verify against ``html2unicode``).
     """
     protected = [
         38,   # Ampersand (&amp;)
         60,   # Less than (&lt;)
         62,   # Greater than (&gt;)
         91,   # Opening bracket - sometimes used intentionally inside links
         93,   # Closing bracket - sometimes used intentionally inside links
         124,  # Vertical bar (&#124;) - used intentionally in navigation bar templates on de:
         160,  # Non-breaking space (&nbsp;) - not supported by Firefox textareas
     ]
     return wikipedia.html2unicode(text, ignore=protected)

예제 #2

파일 보기

파일: cosmetic_changes.py 프로젝트: yknip1207/genewiki

 def resolveHtmlEntities(self, text):
     """Run *text* through ``pywikibot.html2unicode`` and return the result.

     The code points collected below are handed over as the ``ignore``
     argument; presumably entities for these characters are left
     untouched (verify against ``html2unicode``).
     """
     protected = [
         38,   # Ampersand (&amp;)
         60,   # Less than (&lt;)
         62,   # Greater than (&gt;)
         91,   # Opening bracket - sometimes used intentionally inside links
         93,   # Closing bracket - sometimes used intentionally inside links
         124,  # Vertical bar (&#124;) - used intentionally in navigation bar templates on de:
         160,  # Non-breaking space (&nbsp;) - not supported by Firefox textareas
     ]
     if self.site.lang == 'eo':
         # On the Esperanto site the apostrophe (code point 39) is added too, see
         # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
         protected.append(39)
     return pywikibot.html2unicode(text, ignore=protected)

예제 #3

파일 보기

 def resolveHtmlEntities(self, text):
     """Return *text* after passing it through ``pywikibot.html2unicode``.

     A list of code points is supplied as the ``ignore`` argument;
     presumably the matching entities are kept as they are (verify
     against ``html2unicode``).
     """
     keep_as_entity = [
         38,   # Ampersand (&amp;)
         60,   # Less than (&lt;)
         62,   # Greater than (&gt;)
         91,   # Opening bracket - sometimes used intentionally inside links
         93,   # Closing bracket - sometimes used intentionally inside links
         124,  # Vertical bar (&#124;) - used intentionally in navigation bar templates on de:
         160,  # Non-breaking space (&nbsp;) - not supported by Firefox textareas
     ]
     # The Esperanto site additionally lists the apostrophe (39), see
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     esperanto = self.site.lang == 'eo'
     if esperanto:
         keep_as_entity.append(39)
     return pywikibot.html2unicode(text, ignore=keep_as_entity)

예제 #4

파일 보기

파일: cosmetic_changes.py 프로젝트: XXN/pywikibot-compat

 def resolveHtmlEntities(self, text):
     """Return *text* after passing it through ``pywikibot.html2unicode``.

     A list of code points is supplied as the ``ignore`` argument;
     presumably the matching entities are kept as they are (verify
     against ``html2unicode``). When ``self.template`` is truthy the
     colon (58) is added to that list as well.
     """
     # The apostrophe (39) is listed unconditionally here; it used to be
     # added only for the 'eo' site, see
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     keep_as_entity = [
         38,    # Ampersand (&amp;)
         39,    # Apostrophe - Bugzilla 24093
         60,    # Less than (&lt;)
         62,    # Greater than (&gt;)
         91,    # Opening bracket - sometimes used intentionally inside links
         93,    # Closing bracket - sometimes used intentionally inside links
         124,   # Vertical bar (&#124;) - used intentionally in navigation bar templates on de:
         160,   # Non-breaking space (&nbsp;) - not supported by Firefox textareas
         173,   # Soft hyphen (&shy;) - enable editing
         8206,  # Left-to-right mark (&lrm;)
         8207,  # Right-to-left mark (&rlm;)
     ]
     if self.template:
         keep_as_entity.append(58)  # Colon
     return pywikibot.html2unicode(text, ignore=keep_as_entity)

예제 #5

파일 보기

    def transform(self, ispdf=False):
        """Clean up ``self.title`` in place.

        ispdf -- when true, the initial ``html2unicode`` conversion is
        skipped; every other step runs regardless.
        """
        # Convert HTML entities first, unless the caller asked to skip it.
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)

        # Regex clean-ups, applied strictly in this order.
        cleanups = (
            (r'-+', '-'),            # collapse runs of dashes into one
            (r'[\.+\-=]{4,}', ' '),  # drop long useless runs of . + - =
            (r'(?u)\s', ' '),        # any (Unicode) whitespace -> plain space
            (r'[\n\r\t]', ' '),      # newline, carriage return, tab -> space
        )
        for pattern, substitute in cleanups:
            self.title = re.sub(pattern, substitute, self.title)

        # Trim leading/trailing = . ; , - + _ and spaces, then squeeze
        # repeated spaces into a single one.
        trimmed = self.title.strip(r'=.;,-+_ ')
        self.title = re.sub(r' +', ' ', trimmed)

        self.avoid_uppercase()

        # Escape sequences that would otherwise be interpreted as markup.
        escapes = (
            (']', '&#93;'),       # would close the link before the end
            ('}}', '}&#125;'),    # would be read as a template inclusion
            ('\'\'', '\'&#39;'),  # would be interpreted as '' or '''
        )
        for raw, escaped in escapes:
            self.title = self.title.replace(raw, escaped)

        self.title = pywikibot.unicode2html(self.title, self.site.encoding())

예제 #6

파일 보기

파일: reflinks.py 프로젝트: edgarskos/pywikipedia-git

    def transform(self, ispdf=False):
        """Tidy ``self.title`` in place.

        ispdf -- if true, the leading ``html2unicode`` conversion is not
        performed; the remaining steps always run.
        """
        if not ispdf:
            # Convert HTML entities.
            self.title = pywikibot.html2unicode(self.title)

        # Ordered (pattern, replacement) pairs; each one is applied to the
        # result of the previous one.
        regex_steps = [
            (r'-+', '-'),            # several dashes in a row become one
            (r'[\.+\-=]{4,}', ' '),  # formatting, i.e. long useless strings
            (r'(?u)\s', ' '),        # Unicode spaces become plain spaces
            (r'[\n\r\t]', ' '),      # \n, \r and \t become plain spaces
        ]
        for regex, replacement in regex_steps:
            self.title = re.sub(regex, replacement, self.title)

        # Remove leading and trailing = . ; , - + _ and spaces, then
        # collapse extra spaces.
        without_edges = self.title.strip(r'=.;,-+_ ')
        self.title = re.sub(r' +', ' ', without_edges)

        self.avoid_uppercase()

        # Ordered literal replacements guarding against unwanted markup.
        literal_steps = [
            (']', '&#93;'),       # avoid closing the link before the end
            ('}}', '}&#125;'),    # avoid }} being taken as a template inclusion
            ('\'\'', '\'&#39;'),  # avoid quotes being taken as '' or '''
        ]
        for needle, replacement in literal_steps:
            self.title = self.title.replace(needle, replacement)

        self.title = pywikibot.unicode2html(self.title, self.site.encoding())

예제 #7

파일 보기

파일: reflinks.py 프로젝트: hroest/pywikibot-compat

    def transform(self, ispdf=False):
        """Rewrite ``self.title`` in place into a cleaned-up form.

        ispdf -- set to true to skip the initial ``html2unicode``
        conversion; all later steps are unconditional.
        """
        # Step 1: convert HTML entities (skipped when ispdf is true).
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)

        # Step 2: regex substitutions, in order.
        for expr, repl in (
            (r"-+", "-"),            # squeeze repeated dashes
            (r"[\.+\-=]{4,}", " "),  # remove long useless runs of . + - =
            (r"(?u)\s", " "),        # Unicode whitespace to plain space
            (r"[\n\r\t]", " "),      # newline / carriage return / tab to space
        ):
            self.title = re.sub(expr, repl, self.title)

        # Step 3: trim leading and trailing = . ; , - + _ and spaces, then
        # reduce multiple spaces to one.
        core = self.title.strip(r"=.;,-+_ ")
        self.title = re.sub(r" +", " ", core)

        # Step 4: delegate to the sibling helper.
        self.avoid_uppercase()

        # Step 5: escape character sequences that would act as markup.
        for plain, entity in (
            ("]", "&#93;"),      # would close the link before the end
            ("}}", "}&#125;"),   # would be interpreted as a template inclusion
            ("''", "'&#39;"),    # would be interpreted as '' or '''
        ):
            self.title = self.title.replace(plain, entity)

        # Step 6: final conversion using the site's encoding.
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())