def copyDependentFilesAndUpdateLinks(oldfile, filename):
    """
    Copy the files an HTML page links to into the project and relink the page.

    The page at `filename` is analysed for file links. Each link is resolved
    against the directory of `oldfile`; every linked file that exists is
    copied to "<ProjectDir>/File/<page name>_files/" and the link inside the
    page is replaced by the URL-quoted relative path "../File/...". The page
    is then written back to `filename` in its detected encoding.

    Arguments:
    - oldfile: path of the page's original location; only its directory is
      used, as the base for resolving links.
    - filename: path of the HTML file to analyse and rewrite in place.

    Return values:
    None. Files that cannot be found or copied are reported with print and
    their links are left unchanged.
    """
    myanalyzer = analyzer.ContentAnalyzer()
    myanalyzer.analyzeFile(filename)
    htmldir = os.path.dirname(oldfile)
    htmlname = os.path.basename(filename)

    # Close the input handle explicitly instead of relying on garbage collection.
    source = utils.openFile(filename, "r")
    try:
        html = source.read()
    finally:
        source.close()

    # Settle on an encoding from the raw text *before* decoding it.  The
    # previous fallback referenced an undefined name and ran after decoding.
    encoding = GetEncoding(html)
    if encoding is None:
        encoding = utils.getCurrentEncoding()
    if not encoding:
        encoding = utils.guessEncodingForText(html)

    html = utils.makeUnicode(html, encoding)
    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        html = convNotSoSmartQuotesToHtmlEntity(html)

    for link in myanalyzer.fileLinks:
        sourcefile = GetFullPathForURL(link, htmldir)
        if not os.path.exists(sourcefile):
            # Single-argument print(...) behaves the same as the print statement.
            print("cannot find source file: " + sourcefile)
            continue

        sourcedir = os.path.dirname(sourcefile)
        depName = os.path.basename(link)
        destLink = u"../File/" + htmlname + "_files/" + depName
        # destLink[3:] drops the leading "../" so the remainder is relative
        # to the project directory.
        relativeDir = os.path.dirname(destLink[3:].replace("/", os.sep))
        destdir = os.path.join(settings.ProjectDir, relativeDir)
        if not os.path.exists(destdir):
            os.makedirs(destdir)

        if fileutils.CopyFile(depName, sourcedir, destdir):
            html = html.replace(link, urllib.quote(destLink))
        else:
            print("unable to copy file: " + sourcefile)

    # Encode before opening for writing so that an encoding failure does not
    # leave the page truncated on disk.
    encoded = html.encode(encoding)
    output = utils.openFile(filename, "w")
    try:
        output.write(encoded)
    finally:
        output.close()
def GetBody(myhtml):
    """
    Return the markup found between the <body> and </body> tags of a page.

    The page is scanned line by line. A charset is taken from the first
    "<meta" line for which GetEncoding() returns a value; if none is found,
    it is guessed from the collected body text. The body text is then passed
    through utils.makeUnicode(), and any <script> elements found in the
    document's <head> are prepended to it in document order.

    Arguments:
    - myhtml: an open file-like object for the HTML page; it is consumed
      with readlines().

    Return values:
    The body markup with the head's <script> elements prepended.
    """
    inbody = 0
    inscript = 0
    parts = []
    encoding = None

    htmltext = myhtml.readlines()
    for html in htmltext:
        lowered = html.lower()

        if not encoding and lowered.find("<meta") != -1:
            encoding = GetEncoding(html)

        # Track script tags so that a "<body" appearing inside a script is
        # not mistaken for the real body tag.
        scriptstart = html.find("<SCRIPT")
        if scriptstart == -1:
            scriptstart = html.find("<script")
        if lowered.find("</script>") != -1:
            inscript = 0

        # Find the body tag case-insensitively and move to its closing ">".
        bodystart = lowered.find("<body")
        if bodystart != -1:
            bodystart = html.find(">", bodystart)

        # If a script tag opens earlier on the same line, treat the body tag
        # as script content.
        # NOTE(review): a line holding a complete <script>...</script>
        # followed by <body> also takes this branch; behaviour kept as-is.
        if not inbody and bodystart != -1 and scriptstart != -1:
            if bodystart > scriptstart:
                inscript = 1

        # Outside a script, capture text once the body has started.
        if inscript == 0 and (bodystart != -1 or inbody):
            inbody = 1
            bodyend = lowered.find("</body>")
            if bodystart != -1 and bodyend != -1:
                # <body> and </body> on the same line: take what lies between.
                parts.append(html[bodystart+1:bodyend])
                inbody = 0
            elif bodyend != -1:
                # Closing line: keep everything before </body>.
                parts.append(html[0:bodyend])
                inbody = 0
            elif bodystart != -1:
                # Opening line: keep what follows the tag, minus the final
                # character of the line.
                parts.append(html[bodystart+1:-1])
            else:
                parts.append(html)

    text = "".join(parts)

    if not encoding:
        encoding = utils.guessEncodingForText(text)
    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        text = convNotSoSmartQuotesToHtmlEntity(text)
    text = utils.makeUnicode(text, encoding, 'xmlcharrefreplace')

    soup = BeautifulSoup.BeautifulSoup('\n'.join(htmltext))
    # A fragment without an <html> element has no head to inspect.
    root = soup.html
    head = None
    if root is not None:
        head = root.head
    if head:
        scripts = head.findAll('script')
        if scripts:
            # Render each tag to markup explicitly rather than relying on
            # Tag + string concatenation; joining keeps document order.
            text = u"".join(unicode(script) for script in scripts) + text
    return text