Exemplo n.º 1
0
def copyDependentFilesAndUpdateLinks(oldfile, filename):
    """
    Copy the files linked from an HTML page into the project and rewrite the links.

    The page at `filename` is analyzed for file links. Every link that resolves
    (relative to the directory of `oldfile`) to an existing file is copied into
    "<settings.ProjectDir>/File/<basename of filename>_files/", and each
    occurrence of the link in the page is replaced by the URL-quoted relative
    path "../File/<basename of filename>_files/<linked file name>". The
    updated page is then written back to `filename`.

    Arguments:
    - oldfile: path whose directory is used as the base for resolving links
    - filename: path of the HTML file that is read, updated and overwritten

    Return values:
    None. Links whose source file is missing, or whose copy fails, are left
    untouched and a message is printed.
    """
    myanalyzer = analyzer.ContentAnalyzer()
    myanalyzer.analyzeFile(filename)
    htmldir = os.path.dirname(oldfile)

    # Close the read handle explicitly instead of leaving it to the
    # garbage collector.
    infile = utils.openFile(filename, "r")
    try:
        html = infile.read()
    finally:
        infile.close()

    encoding = GetEncoding(html)
    if encoding is None:
        encoding = utils.getCurrentEncoding()

    # Last resort: guess from the text itself. This is done on the text as
    # read, before it is converted, which is the order GetBody uses too.
    if not encoding:
        encoding = utils.guessEncodingForText(html)

    html = utils.makeUnicode(html, encoding)

    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        html = convNotSoSmartQuotesToHtmlEntity(html)

    htmlname = os.path.basename(filename)
    for link in myanalyzer.fileLinks:
        sourcefile = GetFullPathForURL(link, htmldir)

        if not os.path.exists(sourcefile):
            print("cannot find source file: " + sourcefile)
            continue

        sourcedir = os.path.dirname(sourcefile)
        depName = os.path.basename(link)
        destLink = u"../File/" + htmlname + "_files/" + depName
        # destLink[3:] drops the leading "../" so that the remainder can be
        # joined onto the project directory.
        destdir = os.path.join(settings.ProjectDir, os.path.dirname(destLink[3:].replace("/", os.sep)))
        if not os.path.exists(destdir):
            os.makedirs(destdir)

        if fileutils.CopyFile(depName, sourcedir, destdir):
            html = html.replace(link, urllib.quote(destLink))
        else:
            print("unable to copy file: " + sourcefile)

    # Encode before opening the file for writing: opening with "w" truncates
    # it, so an encoding failure must not leave an empty page behind.
    encoded = html.encode(encoding)
    output = utils.openFile(filename, "w")
    try:
        output.write(encoded)
    finally:
        output.close()
Exemplo n.º 2
0
def GetBody(myhtml):
    """
    Function: GetBody(myhtml)
    Description: Gets the data in between the <BODY></BODY> tags of an HTML
    page, scanning the page one line at a time.

    Arguments:
    - myhtml: a file-like object for the HTML page; it is read with readlines()

    Return values:
    Returns the data between the <BODY></BODY> tags of the HTML page after it
    has been passed through utils.makeUnicode, with the <script> elements
    found in the document's <head> prepended in document order.
    """
    inbody = 0
    inscript = 0
    encoding = None
    pieces = []
    htmltext = myhtml.readlines()
    for html in htmltext:
        lowered = html.lower()

        # Take the encoding from the first <meta> line that yields one.
        if not encoding and "<meta" in lowered:
            encoding = GetEncoding(html)

        # Position of an opening script tag on this line, used below to test
        # whether a body tag on the same line sits inside the script.
        scriptstart = html.find("<SCRIPT")
        if scriptstart == -1:
            scriptstart = html.find("<script")

        if "</script>" in lowered:
            inscript = 0

        # Check for the start of body in upper and lowercase; when found,
        # move to the ">" that closes the tag (-1 if it is not on this line).
        bodystart = lowered.find("<body")
        if bodystart != -1:
            bodystart = html.find(">", bodystart)

        # If both a body tag and a script tag are on this line and the script
        # comes first, this isn't the "real" body tag.
        if not inbody and bodystart != -1 and scriptstart != -1:
            if bodystart > scriptstart:
                inscript = 1

        # Nothing to capture while inside a script or before the body starts.
        if inscript or (bodystart == -1 and not inbody):
            continue

        inbody = 1
        bodyend = lowered.find("</body>")

        if bodystart != -1 and bodyend != -1:
            # <BODY> and </BODY> are on the same line: grab what is between.
            pieces.append(html[bodystart + 1:bodyend])
            inbody = 0
        elif bodyend != -1:
            # Last line of the body: keep everything before </BODY>.
            pieces.append(html[:bodyend])
            inbody = 0
        elif bodystart != -1:
            # First line of the body: keep what follows the tag, dropping the
            # final character of the line (its line terminator).
            pieces.append(html[bodystart + 1:-1])
        else:
            pieces.append(html)

    text = "".join(pieces)

    if not encoding:
        encoding = utils.guessEncodingForText(text)

    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        text = convNotSoSmartQuotesToHtmlEntity(text)

    text = utils.makeUnicode(text, encoding, 'xmlcharrefreplace')

    # readlines() keeps each line's terminator, so the lines are joined
    # without a separator to hand the parser the page as it was read.
    soup = BeautifulSoup.BeautifulSoup(''.join(htmltext))
    root = soup.html
    # A page without an <html> element has no head to take scripts from.
    if root is not None and root.head:
        scripts = root.head.findAll('script')
        scripts.reverse() # since we're prepending, we need to do it in reverse order
        for script in scripts:
            # NOTE(review): `script` is a parser element, not a string -
            # confirm that it concatenates with text in the BeautifulSoup
            # version in use.
            text = script + text

    return text