Exemplo n.º 1
0
def decode_url(url):
    """
    Convert a percent-encoded URL, as used in formats such as HTML or
    N-triples, into a Unicode URL.

    Any '<' or '>' characters at either end of the input (the N-triples
    convention for wrapping a URL) are removed before decoding.

    >>> decode_url('<http://dbpedia.org/resource/N%C3%BAria_Espert>')
    'http://dbpedia.org/resource/Núria_Espert'
    """
    bare_url = url.strip('<>')
    raw = unquote(bare_url.encode('utf-8'))
    # The 'replace' handler keeps undecodable bytes from raising here.
    decoded = raw.decode('utf-8', 'replace')
    try:
        unescaped = decode_escapes(decoded)
    except UnicodeDecodeError:
        # Escape decoding failed; fall back to the percent-decoded text.
        return decoded
    return unescaped
Exemplo n.º 2
0
def decode_url(url):
    """
    Turn a percent-encoded URL (for example one taken from HTML or an
    N-triples file) into its Unicode form.

    Leading and trailing angle brackets, which N-triples places around URLs,
    are stripped first.

    >>> decode_url('<http://dbpedia.org/resource/N%C3%BAria_Espert>')
    'http://dbpedia.org/resource/Núria_Espert'
    """
    stripped = url.strip('<>')
    percent_decoded = unquote(stripped.encode('utf-8'))
    text = percent_decoded.decode('utf-8', 'replace')
    try:
        result = decode_escapes(text)
    except UnicodeDecodeError:
        # Keep the percent-decoded text when its escapes cannot be decoded.
        result = text
    return result
def clean_string(s):
    """
    Normalize a raw value into a cleaned-up string.

    The value is converted with ``str()``. If ``isnull`` reports it as
    missing, or it contains no ASCII letter, ``None`` is returned. Otherwise
    the text is run through the encoding and escape repair helpers, a few
    hard-coded garbled sequences are replaced, slashes and backslashes are
    turned into spaces, a single camelCase word is split into separate words,
    and text enclosed in round or square brackets is removed.

    Returns:
        The cleaned string with surrounding whitespace stripped, or ``None``
        when the value is missing or has no ASCII letters.
    """
    s = str(s)
    # NOTE(review): isnull() sees the already-stringified value here; if it
    # is a pandas/numpy-style null check it can never be true at this point.
    # Confirm the intended order before changing it.
    if isnull(s):
        return None
    if re.search('[a-zA-Z]', s) is None:
        return None

    s = remove_bom(s)
    s = remove_control_chars(s)
    s = fix_encoding(s)
    s = fix_text(s)
    s = fix_partial_utf8_punct_in_1252(s)
    s = decode_escapes(s)
    s = fix_latin_ligatures(s)
    s = uncurl_quotes(s)

    # Hard-coded repairs, applied in this fixed order.
    for broken, repaired in (
        ("Äu0087", "ć"),
        ("Äu0090", "Đ"),
        ("Ãu0096", "Ö"),
        ("Åu008D", "ō"),
    ):
        s = s.replace(broken, repaired)

    s = s.replace("\\", " ")
    s = s.replace("/", " ")
    # NOTE(review): both literals render the same; presumably the first is a
    # decomposed form of the second. Confirm against the file's raw bytes.
    s = s.replace("ö", "ö")

    # A single word with a capital letter after its first character is
    # treated as camelCase. Raw strings avoid invalid-escape warnings.
    if re.search(r"^\w+[A-Z]{1}\w*$", s):
        # From: https://stackoverflow.com/a/37697078
        # The replacement needs the leading space; without it the
        # substitution puts back exactly what it matched and does nothing.
        s = re.sub('(?!^)([A-Z][a-z]+)', r' \1', s)

    # Drop everything between an opening "(" or "[" and the next closing
    # ")" or "]", together with the brackets themselves. Nesting is not
    # tracked: any closing bracket ends the skipped region.
    kept = []
    inside_brackets = False
    for letter in s:
        if letter in "([":
            inside_brackets = True
        elif letter in ")]":
            inside_brackets = False
            continue
        if not inside_brackets:
            kept.append(letter)
    return "".join(kept).strip()
Exemplo n.º 4
0
def parse_nquads_line(line):
    """
    Parse one line of N-Triples or N-Quads text.

    Returns a list that is either empty (the line holds nothing but
    comments) or contains exactly four dictionaries, in the order
    (subj, pred, obj, graph). When the line has only three items, an empty
    dictionary is appended in the graph position.

    Each dictionary may contain any of these keys, depending on what was
    matched:

        - 'url': a complete URL indicating a resource. (Pedants: It's an IRI,
          but it's also a URL.)
        - 'text': a string value.
        - 'lang': the language code associated with the given 'text'.
        - 'type': a URL pointing to something in the 'xsd:' namespace,
          indicating how to interpret the given 'text' as a value.
        - 'blank': the arbitrary ID of a blank node.

    An AssertionError carrying the line is raised when the number of parsed
    items is anything other than 0, 3 or 4.
    """
    field_names = ('url', 'text', 'lang', 'type', 'blank', 'comment')
    parsed = []
    for found in NQUADS_ITEM_RE.finditer(line):
        fields = {
            name: found.group(name)
            for name in field_names
            if found.group(name) is not None
        }
        # Comments never become items.
        if 'comment' in fields:
            continue
        if 'url' in fields:
            fields['url'] = decode_url(fields['url'])
        if 'lang' in fields:
            fields['lang'] = langcodes.standardize_tag(fields['lang'])
        if 'type' in fields:
            fields['type'] = decode_url(fields['type'])
        if 'text' in fields:
            fields['text'] = decode_escapes(fields['text'])
        if fields:
            parsed.append(fields)
    # A triple has no graph component; pad it so the result is always a quad.
    if len(parsed) == 3:
        parsed.append({})
    # The line is either empty aside from comments, or contains a quad
    assert len(parsed) in (0, 4), line
    return parsed
Exemplo n.º 5
0
def parse_nquads_line(line):
    """
    Split a line in N-Triples or N-Quads format into its items.

    The result is a list of dictionaries ordered as (subj, pred, obj, graph).
    It is empty when the line contains only comments; otherwise it has four
    entries, with an empty dictionary standing in for the graph when the
    line supplies just three items.

    The keys that may appear in each dictionary are:

        - 'url': a complete URL indicating a resource. (Pedants: It's an IRI,
          but it's also a URL.)
        - 'text': a string value.
        - 'lang': the language code associated with the given 'text'.
        - 'type': a URL pointing to something in the 'xsd:' namespace,
          indicating how to interpret the given 'text' as a value.
        - 'blank': the arbitrary ID of a blank node.

    Fails an assertion (with the line as its message) if the item count is
    not 0, 3 or 4.
    """
    quad = []
    for m in NQUADS_ITEM_RE.finditer(line):
        captured = {}
        for key in ('url', 'text', 'lang', 'type', 'blank', 'comment'):
            value = m.group(key)
            if value is None:
                continue
            captured[key] = value

        # Skip comments, and matches in which no group captured anything.
        if 'comment' in captured or not captured:
            continue

        if 'url' in captured:
            captured['url'] = decode_url(captured['url'])
        if 'lang' in captured:
            captured['lang'] = langcodes.standardize_tag(captured['lang'])
        if 'type' in captured:
            captured['type'] = decode_url(captured['type'])
        if 'text' in captured:
            captured['text'] = decode_escapes(captured['text'])
        quad.append(captured)

    count = len(quad)
    if count == 3:
        # Triples get an empty graph entry so the shape matches a quad.
        quad.append({})
        count = 4
    # The line is either empty aside from comments, or contains a quad
    assert count == 0 or count == 4, line
    return quad
Exemplo n.º 6
0
def do_ftfy(f):
    """
    Print every line of a text file after escape decoding and text fixing.

    Args:
        f: Path of the file to read; it is passed straight to ``open``.

    Each line goes through ``decode_escapes`` and then ``fix_text``. Trailing
    whitespace, including the line ending, is stripped before printing.
    """
    # The handle gets its own name so the path argument is not shadowed, and
    # the file is iterated lazily instead of being loaded with readlines().
    with open(f, 'r') as handle:
        for line in handle:
            print(fix_text(decode_escapes(line)).rstrip())
Exemplo n.º 7
0
# Demo script: calls several ftfy helpers on sample strings and prints the
# results.
# NOTE(review): fix_text and explain_unicode are used here without a visible
# import; presumably they are imported earlier in the file. Confirm.
print(fix_text('ünicode'))

print(fix_text('&lt;3'))

print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))

# The value of this expression is discarded when run as a script; presumably
# it was left over from an interactive session that displayed it.
len(fix_text(''))

# The return value of explain_unicode is not used.
explain_unicode('ノ( º _ ºノ) 테스트')

# Individual fixer functions, each called once below.
from ftfy.fixes import fix_encoding, unescape_html, uncurl_quotes, fix_line_breaks, decode_escapes

print(fix_encoding('â\x81”.'))

print(unescape_html('&lt;hr&gt;'))

print(uncurl_quotes('\u201ctest\u201d'))

# The two adjacent string literals are concatenated into a single argument.
print(fix_line_breaks("1. hello\u2028" "2. world"))

# factoid holds a literal backslash followed by "u20a2" (six characters),
# not the single character U+20A2.
factoid = '\\u20a2'
print(decode_escapes(factoid))

from ftfy.formatting import character_width, display_center

print(character_width('A'))
print(character_width('가'))

# Print display_center for each sample line; 20 and '▒' are presumably the
# target width and the fill character. Confirm against ftfy.formatting.
lines = ['Display center', 'center']
for line in lines:
    print(display_center(line, 20, '▒'))