예제 #1
0
 def resolveHtmlEntities(self, text):
     """Return ``text`` after passing it through ``wikipedia.html2unicode``.

     The numeric code points listed below are handed over as the
     ``ignore`` argument; judging by the parameter name these are the
     characters whose entities should be left untouched.
     """
     keep_encoded = [
         38,   # Ampersand (&)
         60,   # Less than (<)
         62,   # Greater than (>)
         91,   # Opening bracket - sometimes used on purpose inside links
         93,   # Closing bracket - sometimes used on purpose inside links
         124,  # Vertical bar (|) - used on purpose in navigation bar
               # templates on de:
         160,  # Non-breaking space - not supported by Firefox textareas
     ]
     return wikipedia.html2unicode(text, ignore=keep_encoded)
예제 #2
0
 def resolveHtmlEntities(self, text):
     """Return ``text`` after passing it through ``pywikibot.html2unicode``.

     The numeric code points listed below are handed over as the
     ``ignore`` argument; judging by the parameter name these are the
     characters whose entities should be left untouched.
     """
     keep_encoded = [
         38,   # Ampersand (&)
         60,   # Less than (<)
         62,   # Greater than (>)
         91,   # Opening bracket - sometimes used on purpose inside links
         93,   # Closing bracket - sometimes used on purpose inside links
         124,  # Vertical bar (|) - used on purpose in navigation bar
               # templates on de:
         160,  # Non-breaking space - not supported by Firefox textareas
     ]
     # On the Esperanto site the apostrophe (39) is ignored as well, see
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     if self.site.lang == 'eo':
         keep_encoded.append(39)
     return pywikibot.html2unicode(text, ignore=keep_encoded)
예제 #3
0
 def resolveHtmlEntities(self, text):
     """Convert ``text`` with ``pywikibot.html2unicode`` and return it.

     A fixed list of numeric code points, plus the apostrophe on the
     Esperanto site, is supplied as the ``ignore`` argument of that call
     (presumably the characters that must stay entity-encoded).
     """
     skipped = [
         38,   # Ampersand (&)
         60,   # Less than (<)
         62,   # Greater than (>)
         91,   # Opening bracket - sometimes used on purpose inside links
         93,   # Closing bracket - sometimes used on purpose inside links
         124,  # Vertical bar (|) - used on purpose in navigation bar
               # templates on de:
         160,  # Non-breaking space - not supported by Firefox textareas
     ]
     # Apostrophe (39) is skipped too on eo:, see
     # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
     on_esperanto_site = self.site.lang == 'eo'
     if on_esperanto_site:
         skipped.append(39)
     converted = pywikibot.html2unicode(text, ignore=skipped)
     return converted
예제 #4
0
 def resolveHtmlEntities(self, text):
     """Return ``text`` after passing it through ``pywikibot.html2unicode``.

     The numeric code points listed below are handed over as the
     ``ignore`` argument (presumably the characters that must stay
     entity-encoded). When ``self.template`` is truthy the colon (58)
     is added to that list.
     """
     keep_encoded = [
         38,    # Ampersand (&)
         39,    # Apostrophe - Bugzilla 24093; earlier only skipped on eo:,
                # see http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
         60,    # Less than (<)
         62,    # Greater than (>)
         91,    # Opening bracket - sometimes used on purpose inside links
         93,    # Closing bracket - sometimes used on purpose inside links
         124,   # Vertical bar (|) - used on purpose in navigation bar
                # templates on de:
         160,   # Non-breaking space - not supported by Firefox textareas
         173,   # Soft hyphen (U+00AD) - enable editing
         8206,  # Left-to-right mark (U+200E)
         8207,  # Right-to-left mark (U+200F)
     ]
     if self.template:
         keep_encoded.append(58)  # Colon (:)
     return pywikibot.html2unicode(text, ignore=keep_encoded)
예제 #5
0
    def transform(self, ispdf=False):
        """Normalize ``self.title`` in place.

        The title is cleaned up (dash runs, filler strings, whitespace,
        leading/trailing punctuation), passed through
        ``self.avoid_uppercase()``, and finally characters that would be
        interpreted as wiki markup are replaced by numeric HTML entities
        before ``pywikibot.unicode2html`` is applied.

        :param ispdf: when true, the initial ``pywikibot.html2unicode``
            conversion of the title is skipped.
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e. long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # replace every whitespace character (including \n, \r, \t and
        # Unicode spaces) by a plain space; \s already covers [\n\r\t]
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        # remove leading and trailing ./;/,/-/_/+/=/ /, then squeeze
        # repeated spaces
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "'&#39;")
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
예제 #6
0
    def transform(self, ispdf=False):
        """Normalize ``self.title`` in place.

        The title is cleaned up (dash runs, filler strings, whitespace,
        leading/trailing punctuation), passed through
        ``self.avoid_uppercase()``, and finally characters that would be
        interpreted as wiki markup are replaced by numeric HTML entities
        before ``pywikibot.unicode2html`` is applied.

        :param ispdf: when true, the initial ``pywikibot.html2unicode``
            conversion of the title is skipped.
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r'-+', '-', self.title)
        # remove formatting, i.e. long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        # replace every whitespace character (including \n, \r, \t and
        # Unicode spaces) by a plain space; \s already covers [\n\r\t]
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        # remove leading and trailing ./;/,/-/_/+/=/ /, then squeeze
        # repeated spaces
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "'&#39;")
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())
예제 #7
0
    def transform(self, ispdf=False):
        """Normalize ``self.title`` in place.

        The title is cleaned up (dash runs, filler strings, whitespace,
        leading/trailing punctuation), passed through
        ``self.avoid_uppercase()``, and finally characters that would be
        interpreted as wiki markup are replaced by numeric HTML entities
        before ``pywikibot.unicode2html`` is applied.

        :param ispdf: when true, the initial ``pywikibot.html2unicode``
            conversion of the title is skipped.
        """
        # convert html entities
        if not ispdf:
            self.title = pywikibot.html2unicode(self.title)
        # collapse runs of dashes into a single dash
        self.title = re.sub(r"-+", "-", self.title)
        # remove formatting, i.e. long useless strings
        self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
        # replace every whitespace character (including \n, \r, \t and
        # Unicode spaces) by a plain space; \s already covers [\n\r\t]
        self.title = re.sub(r"(?u)\s", " ", self.title)
        # remove leading and trailing ./;/,/-/_/+/=/ /, then squeeze
        # repeated spaces
        self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))

        self.avoid_uppercase()
        # avoid closing the link before the end
        self.title = self.title.replace("]", "&#93;")
        # avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace("}}", "}&#125;")
        # prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace("''", "'&#39;")
        self.title = pywikibot.unicode2html(self.title, self.site.encoding())