def decode_url(url):
    """
    Convert a percent-encoded URL (as used in formats such as HTML or
    N-triples) into a Unicode URL.

    Any '<' or '>' characters at either end of the input are stripped first,
    so a URL wrapped in angle brackets from an N-triples file can be passed
    in directly.

    >>> decode_url('<http://dbpedia.org/resource/N%C3%BAria_Espert>')
    'http://dbpedia.org/resource/Núria_Espert'
    """
    bare_url = url.strip('<>')
    # NOTE(review): `unquote` is assumed to accept bytes and return bytes
    # here (the result is .decode()d) -- confirm against the file's import.
    unquoted_bytes = unquote(bare_url.encode('utf-8'))
    unquoted_text = unquoted_bytes.decode('utf-8', 'replace')
    try:
        unescaped = decode_escapes(unquoted_text)
    except UnicodeDecodeError:
        # Escape decoding failed; fall back to the percent-decoded text.
        return unquoted_text
    return unescaped
def decode_url(url):
    """
    Convert a percent-encoded URL (as used in formats such as HTML or
    N-triples) into a Unicode URL.

    Any '<' or '>' characters at either end of the input are stripped first,
    so a URL wrapped in angle brackets from an N-triples file can be passed
    in directly.

    >>> decode_url('<http://dbpedia.org/resource/N%C3%BAria_Espert>')
    'http://dbpedia.org/resource/Núria_Espert'
    """
    bare_url = url.strip('<>')
    # NOTE(review): `unquote` is assumed to accept bytes and return bytes
    # here (the result is .decode()d) -- confirm against the file's import.
    unquoted_bytes = unquote(bare_url.encode('utf-8'))
    unquoted_text = unquoted_bytes.decode('utf-8', 'replace')
    try:
        unescaped = decode_escapes(unquoted_text)
    except UnicodeDecodeError:
        # Escape decoding failed; fall back to the percent-decoded text.
        return unquoted_text
    return unescaped
def clean_string(s):
    """
    Clean up a raw value and return it as a stripped string, or None.

    The value is converted with ``str()``. None is returned when ``isnull``
    reports the converted string as null or when it contains no ASCII
    letter. Otherwise the string is passed through a fixed sequence of
    text-fixing helpers, a handful of literal substitutions (including
    turning backslashes and forward slashes into spaces), and finally every
    run of characters enclosed in ``(...)`` or ``[...]`` is dropped before
    surrounding whitespace is stripped.
    """
    s = str(s)
    # NOTE(review): isnull() is applied to the str() result, so an input of
    # None has already become the text 'None' by this point -- confirm that
    # this ordering is intended.
    if isnull(s) or re.search('[a-zA-Z]', s) is None:
        return None

    # Order matters: each helper receives the previous helper's output.
    fixers = (
        remove_bom,
        remove_control_chars,
        fix_encoding,
        fix_text,
        fix_partial_utf8_punct_in_1252,
        decode_escapes,
        fix_latin_ligatures,
        uncurl_quotes,
    )
    for fixer in fixers:
        s = fixer(s)

    # Literal substitutions, applied in this exact order.
    # NOTE(review): the two literals of the last pair look identical here;
    # presumably one is a decomposed form of the character -- confirm.
    substitutions = (
        ("Äu0087", "ć"),
        ("Äu0090", "Đ"),
        ("Ãu0096", "Ö"),
        ("Åu008D", "ō"),
        ("\\", " "),
        ("/", " "),
        ("ö", "ö"),
    )
    for old, new in substitutions:
        s = s.replace(old, new)

    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    if re.search(r"^\w+[A-Z]{1}\w*$", s):
        # From: https://stackoverflow.com/a/37697078
        # NOTE(review): the replacement r'\1' re-inserts each match
        # unchanged, so this substitution leaves `s` as it was; the cited
        # answer presumably inserts a space (r' \1') -- confirm intent.
        s = re.sub('(?!^)([A-Z][a-z]+)', r'\1', s)

    # Drop everything between an opening "(" / "[" and the next closing
    # ")" / "]", brackets included. Nesting is not tracked: the first
    # closing bracket ends the skipped region.
    kept = []
    inside_brackets = False
    for char in s:
        if char in "([":
            inside_brackets = True
        elif char in ")]":
            inside_brackets = False
            continue
        if not inside_brackets:
            kept.append(char)
    return "".join(kept).strip()
def parse_nquads_line(line):
    """
    Parse a line in N-Triples or N-Quads format.

    Returns a list that is either empty (the line held nothing but
    comments) or contains exactly four dictionaries: (subj, pred, obj,
    graph). When only three items are found, an empty dictionary is
    appended in the graph position.

    Each dictionary contains fields that may or may not be present,
    indicating the parsed content:

    - 'url': a complete URL indicating a resource. (Pedants: It's an IRI,
      but it's also a URL.)
    - 'text': a string value.
    - 'lang': the language code associated with the given 'text'.
    - 'type': a URL pointing to something in the 'xsd:' namespace,
      indicating how to interpret the given 'text' as a value.
    - 'blank': the arbitrary ID of a blank node.

    Raises AssertionError (with the line as its message) if the number of
    items found is neither 0, 3, nor 4.
    """
    group_names = ('url', 'text', 'lang', 'type', 'blank', 'comment')
    parsed = []
    for found in NQUADS_ITEM_RE.finditer(line):
        captured = ((name, found.group(name)) for name in group_names)
        entry = {name: value for name, value in captured if value is not None}

        # Comments are skipped, and so are matches that captured nothing.
        if 'comment' in entry or not entry:
            continue

        if 'url' in entry:
            entry['url'] = decode_url(entry['url'])
        if 'lang' in entry:
            entry['lang'] = langcodes.standardize_tag(entry['lang'])
        if 'type' in entry:
            entry['type'] = decode_url(entry['type'])
        if 'text' in entry:
            entry['text'] = decode_escapes(entry['text'])
        parsed.append(entry)

    # A triple has no graph item; pad it so the result is quad-shaped.
    if len(parsed) == 3:
        parsed.append({})

    # The line is either empty aside from comments, or contains a quad
    assert len(parsed) in (0, 4), line
    return parsed
def parse_nquads_line(line):
    """
    Parse a line in N-Triples or N-Quads format.

    Returns a list that is either empty (the line held nothing but
    comments) or contains exactly four dictionaries: (subj, pred, obj,
    graph). When only three items are found, an empty dictionary is
    appended in the graph position.

    Each dictionary contains fields that may or may not be present,
    indicating the parsed content:

    - 'url': a complete URL indicating a resource. (Pedants: It's an IRI,
      but it's also a URL.)
    - 'text': a string value.
    - 'lang': the language code associated with the given 'text'.
    - 'type': a URL pointing to something in the 'xsd:' namespace,
      indicating how to interpret the given 'text' as a value.
    - 'blank': the arbitrary ID of a blank node.

    Raises AssertionError (with the line as its message) if the number of
    items found is neither 0, 3, nor 4.
    """
    group_names = ('url', 'text', 'lang', 'type', 'blank', 'comment')
    parsed = []
    for found in NQUADS_ITEM_RE.finditer(line):
        captured = ((name, found.group(name)) for name in group_names)
        entry = {name: value for name, value in captured if value is not None}

        # Comments are skipped, and so are matches that captured nothing.
        if 'comment' in entry or not entry:
            continue

        if 'url' in entry:
            entry['url'] = decode_url(entry['url'])
        if 'lang' in entry:
            entry['lang'] = langcodes.standardize_tag(entry['lang'])
        if 'type' in entry:
            entry['type'] = decode_url(entry['type'])
        if 'text' in entry:
            entry['text'] = decode_escapes(entry['text'])
        parsed.append(entry)

    # A triple has no graph item; pad it so the result is quad-shaped.
    if len(parsed) == 3:
        parsed.append({})

    # The line is either empty aside from comments, or contains a quad
    assert len(parsed) in (0, 4), line
    return parsed
def do_ftfy(f):
    """
    Print a fixed-up copy of every line of the text file at path ``f``.

    The file is opened in text mode and read in full. Each line is passed
    through ``decode_escapes`` and then ``fix_text``, has its trailing
    whitespace removed, and is printed to standard output.
    """
    with open(f, 'r') as handle:
        raw_lines = handle.readlines()
    for raw_line in raw_lines:
        fixed = fix_text(decode_escapes(raw_line))
        print(fixed.rstrip())
# Demo script: call a series of text-fixing helpers on sample strings and
# print what they return.
# NOTE(review): fix_text and explain_unicode are not imported in this span;
# presumably they are imported earlier in the file -- confirm.

# fix_text on a few sample inputs.
print(fix_text('ünicode'))
print(fix_text('<3'))
print(fix_text("¯\\_(ã\x83\x84)_/¯"))
# The length of the result for an empty string is computed but neither
# stored nor printed.
len(fix_text(''))
# The return value is not used; presumably explain_unicode prints its own
# report -- confirm.
explain_unicode('ノ( º _ ºノ) 테스트')

# Individual fixers imported from ftfy.fixes, each run on one sample.
from ftfy.fixes import fix_encoding, unescape_html, uncurl_quotes, fix_line_breaks, decode_escapes
print(fix_encoding('â\x81”.'))
print(unescape_html('<hr>'))
print(uncurl_quotes('\u201ctest\u201d'))
# The two adjacent literals are concatenated into a single argument.
print(fix_line_breaks("1. hello\u2028" "2. world"))
# factoid holds a literal backslash followed by 'u20a2' (six characters),
# not the single character U+20A2.
factoid = '\\u20a2'
print(decode_escapes(factoid))

# Helpers imported from ftfy.formatting.
from ftfy.formatting import character_width, display_center
print(character_width('A'))
print(character_width('가'))
lines = ['Display center', 'center']
for line in lines:
    # Arguments: the text, 20, and '▒' -- presumably the target width and
    # the fill character; confirm against display_center's signature.
    print(display_center(line, 20, '▒'))