예제 #1
0
    def unquote(content, encoding='utf-8', errors='replace'):
        """
        Replace the %xx escapes of a quoted URI string by the characters
        they stand for.

        This is a compatibility wrapper: the underlying unquote routine is
        first called with the encoding and errors keywords and, if that call
        is rejected with a TypeError, it is called again with the content
        alone.

        Note: the default for errors is 'replace', i.e. invalid sequences
              are replaced by a placeholder character.

        Args:
            content (str): The quoted URI string you wish to unquote
            encoding (:obj:`str`, optional): encoding type
            errors (:obj:`str`, optional): how to handle invalid characters
                found in the encoded string (defined by encoding)

        Returns:
            str: The unquoted URI string, or an empty string when content
                is empty or None
        """
        if content:
            try:
                # Python v3.x
                unquoted = _unquote(content, encoding=encoding, errors=errors)

            except TypeError:
                # Python v2.7
                unquoted = _unquote(content)

            return unquoted

        # Nothing to decode
        return ''
def textExtraction(wikidocument, lang):
    """Extract the readable text and the link lists from a wiki HTML page.

    The first capture group of the `_body_re` match is taken as the body.
    Links are collected from that body, then a fixed sequence of regex
    substitutions strips markup.  The order of the substitutions matters:
    each one runs on the output of the previous one.

    Args:
        wikidocument: the page source.  Presumably a UTF-8 encoded byte
            string, since the body is later passed through
            `.decode("utf8")` -- TODO confirm against callers.
        lang: language code paired with every internal link.

    Returns:
        A 3-tuple `(text, internal_links, interlanguage_links)` where
        `text` is the cleaned body re-encoded as UTF-8, `internal_links`
        is a list of `(lang, unquoted_url)` pairs and
        `interlanguage_links` is a list of `(lang_ref, unquoted_url)`
        pairs.

    NOTE(review): there is no guard for a failed `_body_re` search; if it
    returns None the `.group(1)` call raises AttributeError.
    """
    # extract the body part
    body=_body_re.search(wikidocument).group(1)

    # list internal links (the second captured field, document_name, is unused)
    internal_links=[(lang, _unquote(url)) for (url, document_name) in _internal_link.findall(body)]

    # list interlanguage links
    interlanguage_links=[(lang_ref, _unquote(url)) for (lang_ref, url) in _interlanguage_link.findall(body)]


    # replace each link by the second capture group of the match
    body=_link_re.sub((lambda match: match.group(2)), body)

    # suppress the table of contents
    body=_table_toc_re.sub("\n", body)

    # suppress images
    body=_img_re.sub("", body)

    # suppress scripts
    body=_script_re.sub("", body)

    # suppress citations
    body=_cite_re.sub("", body)


    # replace each sup match by its first capture group
    body=_sup_re.sub((lambda match: match.group(1)), body)

    # suppress tables
    body=_table_re.sub("\n", body)

    ## disabled: suppress everything after the "see also" heading
    #see_also_re=_re.compile("<h2><span class=\"mw-headline\" id=\"Voir_aussi\">Voir aussi</span></h2>", _re.DOTALL)
    #match=see_also_re.search(body)
    #if match:
        #body=body[:match.start()]

    # keep only what _p_and_hx_re finds (paragraphs and headings, per its
    # name), one match per line
    body="\n".join(_p_and_hx_re.findall(body))

    # remove (formatting) tags
    body=_tags_re.sub("", body)

    # the following is encoding dependent: the body is decoded as UTF-8
    body=body.decode("utf8")

    # split lines: add a newline after every _end_line_re match
    body=_end_line_re.sub((lambda match: match.group(0)+"\n"), body)

    # encoding normalization: entity matches are rewritten by _entity_callback
    body=_entity_re.sub(_entity_callback, body)



    return (body.encode("utf8"), internal_links, interlanguage_links)
예제 #3
0
def textExtraction(wikidocument, lang):
    """Extract the readable text and the link lists from a wiki HTML page.

    The body is the first capture group of the `_body_re` match.  Links are
    collected from it first, then the markup is stripped by an ordered
    series of regex substitutions, each working on the previous result.

    Args:
        wikidocument: the page source handed to `_body_re.search`.
        lang: language code paired with every internal link.

    Returns:
        A 3-tuple `(text, internal_links, interlanguage_links)`: the cleaned
        body encoded as UTF-8, a list of `(lang, unquoted_url)` pairs and a
        list of `(lang_ref, unquoted_url)` pairs.
    """

    def first_group(found):
        return found.group(1)

    def second_group(found):
        return found.group(2)

    def with_trailing_newline(found):
        return found.group(0) + "\n"

    # extract the body part
    text = _body_re.search(wikidocument).group(1)

    # list internal links (the second captured field is not used)
    internal_links = []
    for url, _document_name in _internal_link.findall(text):
        internal_links.append((lang, _unquote(url)))

    # list interlanguage links
    interlanguage_links = []
    for lang_ref, url in _interlanguage_link.findall(text):
        interlanguage_links.append((lang_ref, _unquote(url)))

    # markup clean-up, applied strictly in this order
    cleanup_steps = (
        (_link_re, second_group),      # replace links
        (_table_toc_re, "\n"),         # suppress the table of contents
        (_img_re, ""),                 # suppress images
        (_script_re, ""),              # suppress scripts
        (_cite_re, ""),                # suppress citations
        (_sup_re, first_group),        # replace sups by their first group
        (_table_re, "\n"),             # suppress tables
    )
    for pattern, replacement in cleanup_steps:
        text = pattern.sub(replacement, text)

    # only keep what _p_and_hx_re finds, one match per line
    kept = _p_and_hx_re.findall(text)
    text = "\n".join(kept)

    # remove (formatting) tags
    text = _tags_re.sub("", text)

    # the following is encoding dependent
    text = text.decode("utf8")

    # split lines
    text = _end_line_re.sub(with_trailing_newline, text)

    # encoding normalization
    text = _entity_re.sub(_entity_callback, text)

    encoded = text.encode("utf8")
    return (encoded, internal_links, interlanguage_links)
예제 #4
0
    def unquote(content, encoding='utf-8', errors='replace'):
        """
        Common unquote function.

        Returns an empty string for empty content; otherwise the content is
        unquoted with the encoding and errors keywords, falling back to a
        plain call when those keywords raise a TypeError.
        """
        if content:
            try:
                # Python v3.x
                unquoted = _unquote(content, encoding=encoding, errors=errors)

            except TypeError:
                # Python v2.7
                unquoted = _unquote(content)

            return unquoted

        return ''
예제 #5
0
파일: common.py 프로젝트: floppym/oauthlib
def unquote(s):
    """Unquote *s* and make sure the result is unicode text.

    Whatever `_unquote` returns is passed through unchanged unless it is an
    instance of `bytes_type`, in which case it is decoded as UTF-8.
    """
    unquoted = _unquote(s)
    # PY3 always returns unicode.  PY2 seems to always return what you give
    # it, which differs from quote's behavior.  Just to be safe, make sure
    # it is unicode before we return.
    if not isinstance(unquoted, bytes_type):
        return unquoted
    return unquoted.decode('utf-8')
예제 #6
0
def unquote(s):
    """Return the unquoted form of *s* as unicode text.

    A `bytes_type` result from `_unquote` is decoded as UTF-8; any other
    result is returned as-is.
    """
    result = _unquote(s)
    # PY3 always returns unicode.  PY2 seems to always return what you give
    # it, which differs from quote's behavior, so decode defensively.
    needs_decoding = isinstance(result, bytes_type)
    return result.decode('utf-8') if needs_decoding else result
예제 #7
0
def unquote(s):
    """Unquote *s* and pass the result through `to_unicode`."""
    unquoted = _unquote(s)
    return to_unicode(unquoted)
예제 #8
0
파일: site_helper.py 프로젝트: npk/zarkpy
def unquote(string):
    """Unquote *string*, which must be exactly a `unicode` or `str` object.

    Unicode input is encoded to UTF-8 before being handed to `_unquote`;
    `str` input is passed through directly.
    """
    assert (type(string) in [unicode, str])
    if isinstance(string, unicode):
        return _unquote(string.encode('utf-8'))
    return _unquote(string)
예제 #9
0
def getUrlParams(url=None):
    """Return the query-string parameters of *url* as a dict.

    Each `&`-separated part of the query is split on its first `=`.  The
    text before it is the key (used verbatim) and the text after it is the
    value, which is passed through `_unquote`.  Parts that contain no `=`
    are ignored.  When a key occurs more than once, the last value wins.

    Args:
        url: the URL to inspect.  When None, the value of
            `getEnv('REQUEST_URI')` is used instead.

    Returns:
        dict: mapping of parameter name to unquoted value.
    """
    if url is None:
        url = getEnv('REQUEST_URI')
    # Index 4 of the parse result is the query component.
    query = urlparse(url)[4]
    params = {}
    for part in query.split('&'):
        # Split on the first '=' only: a value may itself contain '='
        # (e.g. base64 padding) and must not cause the pair to be dropped.
        name, separator, value = part.partition('=')
        if separator:
            params[name] = _unquote(value)
    return params
예제 #10
0
def unquote(string):
    """Unquote *string*, encoding it to UTF-8 first when it is exactly `unicode`."""
    raw = string.encode('utf-8') if type(string) is unicode else string
    return _unquote(raw)
예제 #11
0
파일: chm_input.py 프로젝트: pkuhzx/calibre
 def unquote(x):
     """Unquote *x* and return the result decoded from UTF-8.

     A `unicode_type` input is encoded to UTF-8 before unquoting.
     """
     raw = x.encode('utf-8') if isinstance(x, unicode_type) else x
     unquoted = _unquote(raw)
     return unquoted.decode('utf-8')
예제 #12
0
파일: compat.py 프로젝트: guix77/weboob
 def unquote(s):
     """Pass *s* through `_reencode`, unquote it and decode the result as UTF-8."""
     return _unquote(_reencode(s)).decode('utf-8')
예제 #13
0
def unquote(*l):
    """Unquote one or several values.

    Every argument is converted with `unicodeToStr` and then unquoted.
    A single argument yields a single result; any other argument count
    (including none) yields a tuple of results.
    """
    if len(l) == 1:
        return _unquote(unicodeToStr(l[0]))
    return tuple(_unquote(unicodeToStr(item)) for item in l)
예제 #14
0
def unquote(s):
    """Encode *s* to UTF-8, unquote it and rebuild a unicode object from the result."""
    utf8_bytes = s.encode("utf-8")
    unquoted = _unquote(utf8_bytes)
    return unicode(unquoted, "utf-8")
예제 #15
0
# coding=utf-8
# Copyright 2008-9, Sean B. Palmer, inamidst.com
# Copyright 2012, Elsie Powell, embolalia.com
# Licensed under the Eiffel Forum License 2.
from __future__ import unicode_literals, absolute_import, print_function, division

import re
from sopel import web
from sopel.module import commands, example
import requests
import xmltodict
import sys

if sys.version_info.major >= 3:
    from urllib.parse import quote_plus, unquote
else:
    from urllib import quote_plus, unquote as _unquote

    def unquote(s):
        """Unquote *s* via a UTF-8 encode / decode round trip."""
        return _unquote(s.encode('utf-8')).decode('utf-8')


def formatnumber(n):
    """Format a number with beautiful commas.

    Only the integer digits are grouped.  A leading sign and any fractional
    part are carried over untouched, so ``-123`` stays ``-123`` (instead of
    ``-,123``) and ``1234.5`` becomes ``1,234.5`` (instead of ``123,4.5``).

    Args:
        n: the value to format; it is converted with ``str()`` first.

    Returns:
        The text of ``n`` with a comma between each group of three integer
        digits.  Text whose integer part is not made of digits only (for
        example ``1e+16``) is returned without any commas.
    """
    text = str(n)
    sign = ''
    if text[:1] in ('-', '+'):
        sign, text = text[0], text[1:]
    whole, point, fraction = text.partition('.')
    if not whole.isdigit():
        # Not a plain decimal number: grouping would only mangle it.
        return sign + text
    parts = list(whole)
    # Walk from the right, inserting a separator before every third digit.
    for i in range(len(parts) - 3, 0, -3):
        parts.insert(i, ',')
    return sign + ''.join(parts) + point + fraction


# Captures the href value of a link that directly follows an <h2> opening
# tag; the tag may optionally carry the class attribute ` b_topTitle`.
# NOTE(review): presumably used to pull result URLs out of Bing's HTML --
# confirm against the callers.
r_bing = re.compile(r'<h2(?: class=" b_topTitle")?><a href="([^"]+)"')

예제 #16
0
def unquote(url):
    """Unquote *url* after converting it with `u`.

    When `PY3` is true the UTF-8 encoding is passed to `_unquote`;
    otherwise the result of `_unquote` is decoded from UTF-8.
    """
    if not PY3:
        return _unquote(u(url)).decode("utf-8")
    return _unquote(u(url), encoding="utf-8")
예제 #17
0
파일: search.py 프로젝트: sopel-irc/sopel
 def unquote(s):
     """Encode *s* to UTF-8, unquote it and decode the result from UTF-8."""
     utf8_bytes = s.encode('utf-8')
     unquoted = _unquote(utf8_bytes)
     return unquoted.decode('utf-8')
예제 #18
0
def unquote(string):
    """Unquote *string*, which must be exactly a `unicode` or `str` object.

    `str` input goes to `_unquote` unchanged; unicode input is encoded to
    UTF-8 first.
    """
    assert(type(string) in [unicode, str])
    if not isinstance(string, unicode):
        return _unquote(string)
    utf8_bytes = string.encode('utf-8')
    return _unquote(utf8_bytes)
예제 #19
0
def unquote(s):
    """Convert *s* with `nativeString`, unquote it and wrap the result with `networkString`."""
    native = nativeString(s)
    unquoted = _unquote(native)
    return networkString(unquoted)
예제 #20
0
def unquote(s):
    """Unquote the UTF-8 encoding of *s* and return it as a unicode object."""
    unquoted_bytes = _unquote(s.encode("utf-8"))
    return unicode(unquoted_bytes, "utf-8")
예제 #21
0
 def unquote(s):
     """Unquote the UTF-8 encoding of *s* and decode the result from UTF-8."""
     unquoted = _unquote(s.encode('utf-8'))
     return unquoted.decode('utf-8')
예제 #22
0
 def unquote(data, encoding='utf-8', errors='replace'):
     """Unquote *data* and decode the result with *encoding*.

     The unquoted text is first encoded as latin1 to obtain bytes, which
     are then decoded using *encoding* and the *errors* policy.
     """
     raw_bytes = _unquote(data).encode('latin1')
     return raw_bytes.decode(encoding, errors)
예제 #23
0
def getUrlParams(url=None):
    """Return the query-string parameters of *url* as a dict.

    Each `&`-separated part of the query is split on its first `=`.  The
    text before it is the key (used verbatim) and the text after it is the
    value, which is passed through `_unquote`.  Parts that contain no `=`
    are ignored.  When a key occurs more than once, the last value wins.

    Args:
        url: the URL to inspect.  When None, the value of
            `getEnv('REQUEST_URI')` is used instead.

    Returns:
        dict: mapping of parameter name to unquoted value.
    """
    if url is None:
        url = getEnv('REQUEST_URI')
    # Index 4 of the parse result is the query component.
    query = urlparse(url)[4]
    params = {}
    for part in query.split('&'):
        # Split on the first '=' only: a value may itself contain '='
        # (e.g. base64 padding) and must not cause the pair to be dropped.
        name, separator, value = part.partition('=')
        if separator:
            params[name] = _unquote(value)
    return params
예제 #24
0
# coding=utf-8
# Copyright 2008-9, Sean B. Palmer, inamidst.com
# Copyright 2012, Elsie Powell, embolalia.com
# Licensed under the Eiffel Forum License 2.
from __future__ import unicode_literals, absolute_import, print_function, division

import re
import sys

if sys.version_info.major >= 3:
    from urllib.parse import unquote
else:
    from urllib import unquote as _unquote

    def unquote(s):
        """Unquote *s* via a UTF-8 encode / decode round trip."""
        return _unquote(s.encode('utf-8')).decode('utf-8')

import requests
import xmltodict

from sopel import web
from sopel.module import commands, example


def formatnumber(n):
    """Format a number with beautiful commas.

    Only the integer digits are grouped.  A leading sign and any fractional
    part are carried over untouched, so ``-123`` stays ``-123`` (instead of
    ``-,123``) and ``1234.5`` becomes ``1,234.5`` (instead of ``123,4.5``).

    Args:
        n: the value to format; it is converted with ``str()`` first.

    Returns:
        The text of ``n`` with a comma between each group of three integer
        digits.  Text whose integer part is not made of digits only (for
        example ``1e+16``) is returned without any commas.
    """
    text = str(n)
    sign = ''
    if text[:1] in ('-', '+'):
        sign, text = text[0], text[1:]
    whole, point, fraction = text.partition('.')
    if not whole.isdigit():
        # Not a plain decimal number: grouping would only mangle it.
        return sign + text
    parts = list(whole)
    # Walk from the right, inserting a separator before every third digit.
    for i in range(len(parts) - 3, 0, -3):
        parts.insert(i, ',')
    return sign + ''.join(parts) + point + fraction


# Captures the href value of a link that directly follows an <h2> opening
# tag; the tag may optionally carry the class attribute ` b_topTitle`.
# NOTE(review): presumably used to pull result URLs out of Bing's HTML --
# confirm against the callers.
r_bing = re.compile(r'<h2(?: class=" b_topTitle")?><a href="([^"]+)"')
예제 #25
0
def unquote(*l):
    """Unquote one or several values.

    Each argument goes through `unicodeToStr` and then `_unquote`.  With
    exactly one argument the single result is returned; otherwise a tuple
    with one result per argument is returned.
    """
    if len(l) != 1:
        return tuple(_unquote(unicodeToStr(value)) for value in l)
    only = l[0]
    return _unquote(unicodeToStr(only))
예제 #26
0
 def unquote_to_bytes(data):
     """Unquote *data*, encoding it to ASCII first when it is a `unicode` object."""
     raw = data.encode('ascii') if isinstance(data, unicode) else data
     return _unquote(raw)
예제 #27
0
def unquote(s):
    """Unquote *s* and return the result as ASCII-encoded bytes.

    Bytes input is decoded from ASCII before unquoting; other input is
    passed to `_unquote` as-is.
    """
    text = s.decode("ascii") if isinstance(s, bytes) else s
    return _unquote(text).encode("ascii")
예제 #28
0
 def unquote(x):
     """Unquote *x* and return the result decoded from UTF-8.

     A `unicode` input is encoded to UTF-8 before unquoting.
     """
     raw = x.encode('utf-8') if isinstance(x, unicode) else x
     unquoted = _unquote(raw)
     return unquoted.decode('utf-8')
예제 #29
0
def unquote(s):
    """Unquote the `nativeString` form of *s* and return it via `networkString`."""
    unquoted = _unquote(nativeString(s))
    return networkString(unquoted)
예제 #30
0
 def unquote(value, encoding, errors):
     """Unquote *value* and decode the result with *encoding* and the *errors* policy."""
     unquoted = _unquote(value)
     return unquoted.decode(encoding, errors)