Python makeUnicode примеры, utils.makeUnicode Python примеры использования

Пример #1

0

Показать файл

Файл: htmlutils.py Проект: kollivier/brightwriter

def getUnicodeHTMLForFile(filename):
    """
    Read an HTML file and return its contents converted by utils.makeUnicode.

    The file is read in binary mode and GetEncoding is asked which encoding
    to use; when it reports none, an empty string is passed to
    utils.makeUnicode instead.

    Arguments:
    - filename: path of the HTML file to read

    Return values:
    Returns the result of utils.makeUnicode(html, encoding)
    """
    # Close the handle deterministically rather than leaving it to the
    # garbage collector.
    htmlfile = open(filename, "rb")
    try:
        html = htmlfile.read()
    finally:
        htmlfile.close()

    encoding = GetEncoding(html)
    if not encoding:
        encoding = ""

    return utils.makeUnicode(html, encoding)

Пример #2

0

Показать файл

 def normalize(name):
     """
     Bring a page name into its canonical form: surrounding whitespace
     is stripped, letters are lower-cased and spaces become underscores.
     No special characters are encoded here, so applying the function to
     an already normalized name is meant to leave it unchanged.

     Intended for names that are already mangled for use in URLs, so
     that they map to the same database key; human-readable names should
     go through the mangle function instead.
     """
     text = makeUnicode(name)
     text = text.strip()
     text = text.lower()
     text = text.replace(u" ", u"_")
     return removeEscapes(text)

Пример #3

0

Показать файл

Файл: dom.py Проект: lcrocker/ewc

 def __setitem__(self, key, val):
     """
     Store an attribute value under *key*.

     - "class": *val* is split on spaces and kept as the list self.classes.
     - "style": *val* is read as semicolon-separated "name: value"
       declarations; each one is stored in self.styles with surrounding
       whitespace stripped from both parts. A declaration without a
       colon gets an empty value; empty declarations are skipped.
     - anything else: stored in self.map after makeUnicode conversion.
     """
     if "class" == key:
         self.classes = val.split(u" ")
         return

     if "style" != key:
         self.map[key] = makeUnicode(val)
         return

     for declaration in val.split(u";"):
         if not declaration:
             continue
         # partition yields an empty value part when there is no colon
         prop, _colon, setting = declaration.partition(":")
         self.styles[prop.strip()] = setting.strip().decode()

Пример #4

0

Показать файл

def getPrefix(line):
    """
    Split a leading colon-terminated namespace prefix off *line*.

    Returns a (prefix, rest) pair. A prefix must start with a letter and
    continue with letters, digits, "-" or "_" up to the first colon;
    otherwise the prefix is empty and the line is returned as is. A
    single leading colon is dropped and yields an empty prefix.

    >>> from namespaces import getPrefix
    >>> getPrefix("")
    (u'', u'')
    >>> getPrefix(":")
    (u'', u'')
    >>> getPrefix(":abc")
    (u'', u'abc')
    >>> getPrefix("::abc")
    (u'', u':abc')
    >>> getPrefix("abc")
    (u'', u'abc')
    >>> getPrefix("abc:")
    (u'abc', u'')
    >>> getPrefix("abc:def")
    (u'abc', u'def')
    >>> getPrefix("abc:def:ghi")
    (u'abc', u'def:ghi')
    >>> getPrefix("abc::")
    (u'abc', u':')
    """
    line = makeUnicode(line.lstrip())

    if not line:
        return u"", line

    first = line[0]
    if first == u":":
        return u"", line[1:]
    if not first.isalpha():
        return u"", line

    # Scan the candidate prefix; everything before the colon is the prefix.
    for pos in range(1, len(line)):
        ch = line[pos]
        if ch == u":":
            return line[:pos], line[pos+1:]
        if not (ch.isalnum() or ch in u"-_"):
            return u"", line

    # Ran off the end without meeting a colon: no prefix present.
    return u"", line

Пример #5

0

Показать файл

    def demangle(name):
        """
        Given a name mangled as above, render it in a more readable form.

        Underscores become spaces, each "$" followed by two hexadecimal
        digits becomes the character with that code, and the result is
        capitalized. A "$" with fewer than two characters after it is
        left alone; non-hexadecimal characters after a "$" make int()
        raise ValueError.

        >>> from utils import makeUnicode, removeEscapes
        >>> import namespaces
        >>> namespaces.Local.demangle("a_page_title")
        u'A page title'
        >>> namespaces.Local.demangle("2$3a_a_10$25_$245_b_c")
        u'2: a 10% $5 b c'
        """
        v = list(makeUnicode(name))
        # Bound the loop by the converted character list rather than the raw
        # argument: the two lengths can differ if makeUnicode changes the
        # number of characters (presumably when decoding multi-byte input --
        # confirm), which would otherwise index past the end of v.
        count = len(v)
        for i in xrange(count):
            if v[i] == u"_":
                v[i] = u" "
            elif v[i] == u"$" and i < count - 2:
                v[i] = unichr(16 * int(v[i+1], 16) + int(v[i+2], 16))
                # Overwrite the consumed hex digits with NUL placeholders so
                # later iterations ignore them; presumably removeEscapes
                # strips these afterwards -- confirm.
                v[i+1] = u"\u0000"
                v[i+2] = u"\u0000"
        s = u"".join(v)
        s = removeEscapes(s)
        return s.capitalize()

Пример #6

0

Показать файл

Файл: htmlutils.py Проект: kollivier/brightwriter

def copyDependentFilesAndUpdateLinks(oldfile, filename):
    myanalyzer = analyzer.ContentAnalyzer()
    myanalyzer.analyzeFile(filename)
    htmldir = os.path.dirname(oldfile)
    html = utils.openFile(filename, "r").read()
    encoding = GetEncoding(html)
    if encoding == None:
        encoding = utils.getCurrentEncoding()
        
    html = utils.makeUnicode(html, encoding)
    
    if not encoding:
        encoding = utils.guessEncodingForText(text)
    
    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        html = convNotSoSmartQuotesToHtmlEntity(html)
    
    for link in myanalyzer.fileLinks:
        sourcefile = GetFullPathForURL(link, htmldir)
        
        if os.path.exists(sourcefile):
            sourcedir = os.path.dirname(sourcefile)
            htmlname = os.path.basename(filename)
            depName = os.path.basename(link)
            destLink = u"../File/" + htmlname + "_files/" + depName
            destdir = os.path.join(settings.ProjectDir, os.path.dirname(destLink[3:].replace("/", os.sep)))
            if not os.path.exists(destdir):
                os.makedirs(destdir)
            result = fileutils.CopyFile(depName, sourcedir, destdir)
            if result:
                html = html.replace(link, urllib.quote(destLink))
            else:
                print "unable to copy file: " + sourcefile
        else:
            print "cannot find source file: " + sourcefile
                
    output = utils.openFile(filename, "w")
    output.write(html.encode(encoding))
    output.close()

Пример #7

0

Показать файл

Файл: conman.py Проект: kollivier/brightwriter

 def __setattr__(self, name, value):
     """
     Store attributes as Unicode internally.

     Every value except "encoding" itself is passed through
     utils.makeUnicode with the current self.encoding before it is
     written to the instance dictionary.
     """
     if name == "encoding":
         stored = value
     else:
         stored = utils.makeUnicode(value, self.encoding)
     self.__dict__[name] = stored

Пример #8

0

Показать файл

Файл: dom.py Проект: lcrocker/ewc

 def _setvalue(self, text):
     """Store *text* in self._value after converting it with makeUnicode."""
     converted = makeUnicode(text)
     self._value = converted

Пример #9

0

Показать файл

Файл: dom.py Проект: lcrocker/ewc

 def __init__(self, parent=None, val=u""):
     """
     Initialize the node with an optional parent and an initial value.

     Raises NotImplementedError when the object's class is exactly
     CharacterData. The value is stored after makeUnicode conversion.
     """
     is_base_class = self.__class__ is CharacterData
     if is_base_class:
         raise NotImplementedError

     Node.__init__(self, parent)
     self._value = makeUnicode(val)

Пример #10

0

Показать файл

 def imageURL(self, title):
     """
     Build the image URL for *title* by substituting its mangled form
     into config.localImagePattern.
     """
     pattern = makeUnicode(config.localImagePattern)
     mangled = Local.mangle(title)
     return pattern % mangled

Пример #11

0

Показать файл

 def linkURL(self, title):
     """
     Build the link URL for *title* by substituting its mangled form
     into config.localLinkPattern.
     """
     pattern = makeUnicode(config.localLinkPattern)
     mangled = Local.mangle(title)
     return pattern % mangled

Пример #12

0

Показать файл

Файл: htmlutils.py Проект: kollivier/brightwriter

def GetBody(myhtml):
    """
    Function: GetBody(myhtml)
    Description: Get the data in between the <BODY></BODY> tags of an HTML
    page, with any <script> elements found in the document head placed in
    front of it.

    Arguments:
    - myhtml: an open file-like object for the HTML page; it is consumed
      with readlines()

    Return values:
    Returns the data between the <BODY></BODY> tags of the HTML page,
    converted with utils.makeUnicode, preceded by the head's scripts
    """
    inbody = 0
    inscript = 0
    chunks = []
    encoding = None
    htmltext = myhtml.readlines()
    for html in htmltext:
        lowered = html.lower()

        if not encoding and lowered.find("<meta") != -1:
            encoding = GetEncoding(html)

        #if we're inside a script, mark it so that we can test if body tag is inside the script
        scriptstart = html.find("<SCRIPT")
        if scriptstart == -1:
            scriptstart = html.find("<script")

        if lowered.find("</script>") != -1:
            inscript = 0

        #check for start of body in upper and lowercase
        bodystart = lowered.find("<body")

        #if body is found, mark the end of it
        if bodystart != -1:
            bodystart = html.find(">", bodystart)

        #if we've found both a body tag and a script tag, find which one comes first
        #if script is first, this isn't the "real" body tag
        if not inbody and bodystart != -1 and scriptstart != -1:
            if bodystart > scriptstart:
                inscript = 1

        #if we are not in a script, and we've found the body tag, capture the text
        if inscript == 0 and (bodystart != -1 or inbody):
            inbody = 1
            bodyend = lowered.find("</body>")

            if bodystart != -1 and bodyend != -1:
                #both <BODY> and </BODY> are on same line, grab it all
                chunks.append(html[bodystart+1:bodyend])
                inbody = 0
            elif bodyend != -1:
                #closing line: keep everything before </BODY>
                chunks.append(html[0:bodyend])
                inbody = 0
            elif bodystart != -1:
                #opening line: keep what follows the tag, minus the last character
                chunks.append(html[bodystart+1:-1])
            else:
                #a line fully inside the body
                chunks.append(html)

    text = "".join(chunks)

    if not encoding:
        encoding = utils.guessEncodingForText(text)

    if encoding and encoding.lower() in ["windows-1252", "iso-8859-1", "iso-8859-2"]:
        text = convNotSoSmartQuotesToHtmlEntity(text)

    text = utils.makeUnicode(text, encoding, 'xmlcharrefreplace')

    # readlines() keeps each line's terminator, so join without adding a
    # second newline between lines.
    soup = BeautifulSoup.BeautifulSoup(''.join(htmltext))
    # A fragment without an <html> element has nothing to take scripts from.
    if soup.html is not None and soup.html.head:
        scripts = soup.html.head.findAll('script')
        scripts.reverse() # since we're prepending, we need to do it in reverse order
        for script in scripts:
            # NOTE(review): script is a parsed element rather than a string;
            # presumably its string form is what should be prepended -- confirm.
            text = script + text

    return text

Python makeUnicode примеры использования