def resolveHtmlEntities(self, text):
    """Run ``text`` through ``wikipedia.html2unicode`` and return the result.

    The code points listed below are handed over as the ``ignore`` argument,
    i.e. entities for these characters are meant to be left as they are.
    """
    preserved = [
        38,   # Ampersand (&amp;)
        60,   # Less than (&lt;)
        62,   # Greater than (&gt;)
        91,   # Opening bracket - sometimes used intentionally inside links
        93,   # Closing bracket - sometimes used intentionally inside links
        124,  # Vertical bar (|) - used intentionally in navigation bar
              # templates on de:
        160,  # Non-breaking space (&nbsp;) - not supported by Firefox
              # textareas
    ]
    return wikipedia.html2unicode(text, ignore=preserved)
def resolveHtmlEntities(self, text):
    """Run ``text`` through ``pywikibot.html2unicode`` and return the result.

    The code points collected below are passed as the ``ignore`` argument,
    i.e. entities for these characters are meant to be left as they are.
    When ``self.site.lang`` is ``'eo'`` the apostrophe (39) is ignored too.
    """
    preserved = [
        38,   # Ampersand (&amp;)
        60,   # Less than (&lt;)
        62,   # Greater than (&gt;)
        91,   # Opening bracket - sometimes used intentionally inside links
        93,   # Closing bracket - sometimes used intentionally inside links
        124,  # Vertical bar (|) - used intentionally in navigation bar
              # templates on de:
        160,  # Non-breaking space (&nbsp;) - not supported by Firefox
              # textareas
    ]
    if self.site.lang == 'eo':
        # Keep the apostrophe entity on eo:, see
        # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
        preserved.append(39)
    return pywikibot.html2unicode(text, ignore=preserved)
def resolveHtmlEntities(self, text):
    """Return ``text`` after processing by ``pywikibot.html2unicode``.

    A list of code points is supplied as ``ignore`` so that entities for
    those characters are meant to stay untouched. On a site whose ``lang``
    is ``'eo'`` the apostrophe (39) is added to that list.
    """
    # Code points that are always excluded from conversion.
    skip_codes = [
        38,   # Ampersand (&amp;)
        60,   # Less than (&lt;)
        62,   # Greater than (&gt;)
        91,   # Opening bracket - sometimes used intentionally inside links
        93,   # Closing bracket - sometimes used intentionally inside links
        124,  # Vertical bar (|) - used intentionally in navigation bar
              # templates on de:
        160,  # Non-breaking space (&nbsp;) - not supported by Firefox
              # textareas
    ]
    on_esperanto_wiki = self.site.lang == 'eo'
    if on_esperanto_wiki:
        # Apostrophe entity must survive on eo:, see
        # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
        skip_codes.extend([39])
    converted = pywikibot.html2unicode(text, ignore=skip_codes)
    return converted
def resolveHtmlEntities(self, text):
    """Return ``text`` after processing by ``pywikibot.html2unicode``.

    The code points listed below are passed as ``ignore`` so that entities
    for those characters are meant to stay untouched. When ``self.template``
    is truthy, the colon (58) is excluded from conversion as well.
    """
    skip_codes = [
        38,    # Ampersand (&amp;)
        39,    # Apostrophe - Bugzilla 24093. Was formerly skipped on eo:
               # only, see
               # http://eo.wikipedia.org/w/index.php?title=Liberec&diff=next&oldid=2320801
        60,    # Less than (&lt;)
        62,    # Greater than (&gt;)
        91,    # Opening bracket - sometimes used intentionally inside links
        93,    # Closing bracket - sometimes used intentionally inside links
        124,   # Vertical bar (|) - used intentionally in navigation bar
               # templates on de:
        160,   # Non-breaking space (&nbsp;) - not supported by Firefox
               # textareas
        173,   # Soft hyphen (&shy;) - enable editing
        8206,  # Left-to-right mark (&lrm;)
        8207,  # Right-to-left mark (&rlm;)
    ]
    if self.template:
        skip_codes.append(58)  # Colon (:)
    return pywikibot.html2unicode(text, ignore=skip_codes)
def transform(self, ispdf=False):
    """Normalize ``self.title`` in place for use inside wiki markup.

    In order: optionally convert HTML entities, collapse runs of dashes,
    blank out long runs of filler punctuation, flatten whitespace, trim
    punctuation from both ends, call ``self.avoid_uppercase()``, escape
    characters that would terminate or alter the surrounding wikitext, and
    finally pass the title through ``pywikibot.unicode2html`` using the
    site's encoding.

    :param ispdf: when true, the HTML-entity conversion step is skipped.
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r'-+', '-', self.title)
    # remove formatting, i.e. long useless strings
    self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r'(?u)\s', ' ', self.title)
    self.title = re.sub(r'[\n\r\t]', ' ', self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

    self.avoid_uppercase()

    # The replacements below must emit numeric character references;
    # replacing a character with itself would leave the markup hazard in
    # place.
    # avoid closing the link before the end
    self.title = self.title.replace(']', '&#93;')
    # avoid multiple } being interpreted as a template inclusion
    self.title = self.title.replace('}}', '}&#125;')
    # prevent multiple quotes being interpreted as '' or '''
    self.title = self.title.replace('\'\'', '\'&#39;')
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())
def transform(self, ispdf=False):
    """Normalize ``self.title`` in place for use inside wiki markup.

    In order: optionally convert HTML entities, collapse runs of dashes,
    blank out long runs of filler punctuation, flatten whitespace, trim
    punctuation from both ends, call ``self.avoid_uppercase()``, escape
    characters that would terminate or alter the surrounding wikitext, and
    finally pass the title through ``pywikibot.unicode2html`` using the
    site's encoding.

    :param ispdf: when true, the HTML-entity conversion step is skipped.
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r'-+', '-', self.title)
    # remove formatting, i.e. long useless strings
    self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r'(?u)\s', ' ', self.title)
    self.title = re.sub(r'[\n\r\t]', ' ', self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    trimmed = self.title.strip(r'=.;,-+_ ')
    self.title = re.sub(r' +', ' ', trimmed)

    self.avoid_uppercase()

    # Each hazard is replaced by a numeric character reference; a
    # same-for-same replacement would not protect the markup at all.
    # avoid closing the link before the end
    self.title = self.title.replace(']', '&#93;')
    # avoid multiple } being interpreted as a template inclusion
    self.title = self.title.replace('}}', '}&#125;')
    # prevent multiple quotes being interpreted as '' or '''
    self.title = self.title.replace('\'\'', '\'&#39;')
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())
def transform(self, ispdf=False):
    """Normalize ``self.title`` in place for use inside wiki markup.

    In order: optionally convert HTML entities, collapse runs of dashes,
    blank out long runs of filler punctuation, flatten whitespace, trim
    punctuation from both ends, call ``self.avoid_uppercase()``, escape
    characters that would terminate or alter the surrounding wikitext, and
    finally pass the title through ``pywikibot.unicode2html`` using the
    site's encoding.

    :param ispdf: when true, the HTML-entity conversion step is skipped.
    """
    # convert html entities
    if not ispdf:
        self.title = pywikibot.html2unicode(self.title)
    self.title = re.sub(r"-+", "-", self.title)
    # remove formatting, i.e. long useless strings
    self.title = re.sub(r"[\.+\-=]{4,}", " ", self.title)
    # remove \n and \r and Unicode spaces from titles
    self.title = re.sub(r"(?u)\s", " ", self.title)
    self.title = re.sub(r"[\n\r\t]", " ", self.title)
    # remove extra whitespaces
    # remove leading and trailing ./;/,/-/_/+/ /
    self.title = re.sub(r" +", " ", self.title.strip(r"=.;,-+_ "))

    self.avoid_uppercase()

    # Escape with numeric character references: replacing a sequence with
    # itself (as before) is a no-op and leaves the wikitext hazard intact.
    # avoid closing the link before the end
    self.title = self.title.replace("]", "&#93;")
    # avoid multiple } being interpreted as a template inclusion
    self.title = self.title.replace("}}", "}&#125;")
    # prevent multiple quotes being interpreted as '' or '''
    self.title = self.title.replace("''", "'&#39;")
    self.title = pywikibot.unicode2html(self.title, self.site.encoding())