def getUnicodeHTMLForFile(filename):
    """
    Read an HTML file and return its contents converted by utils.makeUnicode.

    The file is read in binary mode, GetEncoding() is asked for the encoding
    of the raw bytes, and an empty string is passed as the encoding when
    none is reported.

    Arguments:
    - filename: path of the HTML file to read

    Return values:
    Whatever utils.makeUnicode(html, encoding) returns for the file contents.
    """
    htmlfile = open(filename, "rb")
    try:
        html = htmlfile.read()
    finally:
        # Always release the handle, even if the read fails.
        htmlfile.close()

    encoding = GetEncoding(html)
    if not encoding:
        encoding = ""
    return utils.makeUnicode(html, encoding)
def normalize(name):
    """
    Bring a page name into canonical form without encoding special characters.

    The name is converted with makeUnicode, stripped of surrounding
    whitespace, lower-cased, has its spaces replaced by underscores, and is
    finally passed through removeEscapes.  The transform is meant to be
    repeatable: applying it to an already-normalized name should change
    nothing.  It is intended for names that are already mangled for use as
    URLs, so that they map to the same database key; human-readable names
    should go through the mangle function instead.
    """
    cleaned = makeUnicode(name).strip().lower()
    underscored = cleaned.replace(u" ", u"_")
    return removeEscapes(underscored)
def __setitem__(self, key, val):
    """
    Store an attribute, giving "class" and "style" special treatment.

    - "class": val is split on spaces and stored as the list self.classes.
    - "style": val is split on ";" into declarations; each non-empty
      declaration is split once on ":" and stored in self.styles as
      stripped-name -> stripped-value (a declaration without ":" gets an
      empty value).
    - anything else: stored in self.map[key] after makeUnicode(val).
    """
    if key == "class":
        self.classes = val.split(u" ")
    elif key == "style":
        for declaration in val.split(u";"):
            if not declaration:
                continue
            pieces = declaration.split(":", 1)
            if len(pieces) == 1:
                pieces.append("")
            # Splitting on a unicode separator already yields unicode
            # pieces, so no further decoding is needed.  The previous
            # .decode() call forced an implicit ASCII round-trip that
            # raised UnicodeEncodeError for non-ASCII style values.
            self.styles[pieces[0].strip()] = pieces[1].strip()
    else:
        self.map[key] = makeUnicode(val)
def getPrefix(line):
    """
    Split a leading colon-terminated namespace prefix off a line.

    Returns a (prefix, remainder) pair.  A prefix must start with a letter
    and may continue with letters, digits, "-" or "_" up to the first colon.
    When no valid prefix is present the prefix is empty; a single leading
    colon is dropped from the remainder.

    >>> from namespaces import getPrefix
    >>> getPrefix("")
    (u'', u'')
    >>> getPrefix(":")
    (u'', u'')
    >>> getPrefix(":abc")
    (u'', u'abc')
    >>> getPrefix("::abc")
    (u'', u':abc')
    >>> getPrefix("abc")
    (u'', u'abc')
    >>> getPrefix("abc:")
    (u'abc', u'')
    >>> getPrefix("abc:def")
    (u'abc', u'def')
    >>> getPrefix("abc:def:ghi")
    (u'abc', u'def:ghi')
    >>> getPrefix("abc::")
    (u'abc', u':')
    """
    text = makeUnicode(line.lstrip())
    if not text:
        return u"", text

    first = text[0]
    if first == u":":
        # An explicit leading colon means "no namespace".
        return u"", text[1:]
    if not first.isalpha():
        return u"", text

    for pos in range(1, len(text)):
        ch = text[pos]
        if ch == u":":
            # Everything scanned so far was a valid prefix character.
            return text[:pos], text[pos + 1:]
        if not (ch.isalnum() or ch in u"-_"):
            return u"", text

    # Ran off the end without meeting a colon: no prefix.
    return u"", text
def demangle(name):
    """
    Given a name mangled as above, render it in a more readable form.

    Underscores become spaces and each "$XY" sequence (two hex digits)
    becomes the character with that code.  The two consumed digit slots are
    overwritten with NUL characters and the result is passed through
    removeEscapes (presumably what strips those NULs -- confirm against
    removeEscapes) before being capitalized.

    Raises ValueError if a "$" is followed by characters that are not
    hexadecimal digits.

    >>> from utils import makeUnicode, removeEscapes
    >>> import namespaces
    >>> namespaces.Local.demangle("a_page_title")
    u'A page title'
    >>> namespaces.Local.demangle("2$3a_a_10$25_$245_b_c")
    u'2: a 10% $5 b c'
    """
    v = list(makeUnicode(name))
    # Use the length of the decoded character list, not of the raw argument:
    # the two can differ when name is a byte string, which previously could
    # index past the end of v.
    count = len(v)
    for i in xrange(count):
        if v[i] == u"_":
            v[i] = u" "
        elif v[i] == u"$" and i < count - 2:
            v[i] = unichr(16 * int(v[i + 1], 16) + int(v[i + 2], 16))
            v[i + 1] = u"\u0000"
            v[i + 2] = u"\u0000"
    s = removeEscapes(u"".join(v))
    return s.capitalize()
def copyDependentFilesAndUpdateLinks(oldfile, filename):
    """
    Copy the files an HTML page links to into the project and rewrite links.

    The page at filename is analyzed for file links.  Each link is resolved
    relative to the directory of oldfile; files that exist are copied to
    <settings.ProjectDir>/File/<page name>_files/ and every occurrence of the
    link in the page is replaced by the URL-quoted new relative location.
    The updated page is written back to filename using the encoding that was
    detected for it.

    Arguments:
    - oldfile: original location of the page; its directory is used to
      resolve the page's links
    - filename: the page to analyze and rewrite in place
    """
    myanalyzer = analyzer.ContentAnalyzer()
    myanalyzer.analyzeFile(filename)
    htmldir = os.path.dirname(oldfile)

    infile = utils.openFile(filename, "r")
    try:
        html = infile.read()
    finally:
        infile.close()

    encoding = GetEncoding(html)
    if encoding is None:
        encoding = utils.getCurrentEncoding()
    if not encoding:
        # Guess from the raw text before it is decoded.  This used to
        # reference an undefined name ("text") and raised NameError.
        encoding = utils.guessEncodingForText(html)
    html = utils.makeUnicode(html, encoding)

    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        html = convNotSoSmartQuotesToHtmlEntity(html)

    htmlname = os.path.basename(filename)
    for link in myanalyzer.fileLinks:
        sourcefile = GetFullPathForURL(link, htmldir)
        if not os.path.exists(sourcefile):
            print("cannot find source file: " + sourcefile)
            continue

        sourcedir = os.path.dirname(sourcefile)
        depName = os.path.basename(link)
        destLink = u"../File/" + htmlname + "_files/" + depName
        # destLink[3:] drops the leading "../" to get a project-relative path.
        destdir = os.path.join(settings.ProjectDir,
                               os.path.dirname(destLink[3:].replace("/", os.sep)))
        if not os.path.exists(destdir):
            os.makedirs(destdir)

        result = fileutils.CopyFile(depName, sourcedir, destdir)
        if result:
            html = html.replace(link, urllib.quote(destLink))
        else:
            print("unable to copy file: " + sourcefile)

    # Encode before opening for writing so that an encoding failure does not
    # leave the page truncated.
    encoded = html.encode(encoding)
    output = utils.openFile(filename, "w")
    try:
        output.write(encoded)
    finally:
        output.close()
def __setattr__(self, name, value):
    """
    Store an attribute, converting everything but "encoding" to Unicode.

    The "encoding" attribute is stored as given; every other value is first
    passed through utils.makeUnicode together with self.encoding so that
    internal state is always Unicode.
    """
    if name == "encoding":
        stored = value
    else:
        stored = utils.makeUnicode(value, self.encoding)
    # Write straight into the instance dictionary, as the original does.
    self.__dict__[name] = stored
def _setvalue(self, text):
    """Set this object's value to text after passing it through makeUnicode."""
    converted = makeUnicode(text)
    self._value = converted
def __init__(self, parent=None, val=u""):
    """
    Initialise the node under parent with val as its character data.

    CharacterData itself cannot be instantiated (NotImplementedError is
    raised); only its subclasses can.  The value is stored after conversion
    by makeUnicode.
    """
    instantiating_base = self.__class__ is CharacterData
    if instantiating_base:
        raise NotImplementedError

    Node.__init__(self, parent)
    initial = makeUnicode(val)
    self._value = initial
def imageURL(self, title):
    """Return config.localImagePattern filled in with the mangled title."""
    pattern = makeUnicode(config.localImagePattern)
    mangled = Local.mangle(title)
    return pattern % mangled
def linkURL(self, title):
    """Return config.localLinkPattern filled in with the mangled title."""
    pattern = makeUnicode(config.localLinkPattern)
    mangled = Local.mangle(title)
    return pattern % mangled
def GetBody(myhtml):
    """
    Return the markup found between the <BODY> and </BODY> tags of a page.

    The page is scanned line by line.  A "<body" tag that appears after a
    "<script" tag on the same line is treated as being inside the script and
    ignored.  The encoding is taken from the first line containing "<meta"
    for which GetEncoding() reports one, otherwise it is guessed from the
    collected body text.  Finally any <script> elements found in the page's
    <head> are prepended to the body text.

    Arguments:
    - myhtml: an open file-like object for the HTML page (readlines() is
      called on it)

    Return values:
    The body text as converted by utils.makeUnicode, with head scripts
    prepended.
    """
    inbody = 0
    inscript = 0
    parts = []
    encoding = None
    htmltext = myhtml.readlines()

    for html in htmltext:
        lowered = html.lower()

        if not encoding and lowered.find("<meta") != -1:
            encoding = GetEncoding(html)

        # Note where a script starts on this line so that a body tag
        # appearing after it can be recognised as part of the script.
        scriptstart = html.find("<SCRIPT")
        if scriptstart == -1:
            scriptstart = html.find("<script")
        if lowered.find("</script>") != -1:
            inscript = 0

        # Look for the start of the body, case-insensitively, and move to
        # the ">" that closes the opening tag.
        bodystart = lowered.find("<body")
        if bodystart != -1:
            bodystart = html.find(">", bodystart)

        # If both a body tag and a script tag are on this line and the
        # script comes first, this is not the "real" body tag.
        if (not inbody and bodystart != -1) and scriptstart != -1:
            if bodystart > scriptstart:
                inscript = 1

        # Outside a script, capture text once the body has been entered.
        if inscript == 0 and (bodystart != -1 or inbody):
            inbody = 1
            bodyend = lowered.find("</body>")
            if bodystart != -1 and bodyend != -1:
                # <BODY> and </BODY> on the same line: take what is between.
                parts.append(html[bodystart + 1:bodyend])
                inbody = 0
            elif bodyend != -1:
                inbody = 0
                parts.append(html[0:bodyend])
            elif bodystart != -1:
                # The slice drops the line's final character (presumably
                # the newline -- unchanged from the original behaviour).
                parts.append(html[bodystart + 1:-1])
            else:
                parts.append(html)

    text = "".join(parts)

    if not encoding:
        encoding = utils.guessEncodingForText(text)
    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        text = convNotSoSmartQuotesToHtmlEntity(text)
    text = utils.makeUnicode(text, encoding, 'xmlcharrefreplace')

    soup = BeautifulSoup.BeautifulSoup('\n'.join(htmltext))
    root = soup.html
    # A fragment without an <html> element has no head to inspect; the
    # unguarded soup.html.head lookup failed with AttributeError in that case.
    if root is not None and root.head:
        scripts = root.head.findAll('script')
        # Since we're prepending, we need to do it in reverse order.
        scripts.reverse()
        for script in scripts:
            # NOTE(review): script is a parsed tag object, not a string;
            # confirm that "tag + text" concatenates as intended with the
            # BeautifulSoup version in use.
            text = script + text
    return text