def test_fix_bad_unicode():
    """Exercise ``cleantext.fix_bad_unicode`` on escaped quotes and diacritics."""
    # Intended reading of the input: and install a ‘new’ society in their
    escaped = "and install a \\u2018new\\u2019 society in their"
    expected = "and install a 'new' society in their"
    assert cleantext.fix_bad_unicode(escaped) == expected

    # Words containing diacritics are expected to come back unchanged.
    for word in ("všetko", "Všetko"):
        assert word == cleantext.fix_bad_unicode(word)
def clean(
    docstring: str,
    language: Optional[str] = None,
    extract_summary: bool = True,
    no_comment_delimiters: bool = True,
    no_html_tags: bool = True,
    no_doctags: bool = True,
    no_urls: bool = True,
    url_replacement: str = "",
    tokenize: bool = True,
    fix_unicode: bool = True,
):
    """Run the enabled cleaning steps over ``docstring`` in a fixed order.

    Each step is switched by its own flag and feeds its result to the next
    one: comment delimiters, summary extraction, unicode fixing, URL removal,
    HTML tag removal, doctag removal and finally tokenization.

    Args:
        docstring: Raw docstring text to clean.
        language: Forwarded to ``extract_docstring_summary`` and
            ``remove_doctags``.
        extract_summary: Apply ``extract_docstring_summary``.
        no_comment_delimiters: Apply ``remove_comment_delimiters``.
        no_html_tags: Apply ``remove_html_tags`` (skipped when the text is
            already empty at that point).
        no_doctags: Apply ``remove_doctags`` with ``keep_inside=True``.
        no_urls: Apply ``remove_urls``.
        url_replacement: Passed to ``remove_urls`` as ``replace_with``.
        tokenize: Apply ``tokenize_csn`` as the last step.
        fix_unicode: Apply ``fix_bad_unicode``.

    Returns:
        The result of the last enabled step; the input itself when every
        step is disabled.
    """
    if no_comment_delimiters:
        docstring = remove_comment_delimiters(docstring)
    if extract_summary:
        docstring = extract_docstring_summary(docstring, language=language)
    if fix_unicode:
        docstring = fix_bad_unicode(docstring)
    if no_urls:
        docstring = remove_urls(docstring, replace_with=url_replacement)
    if no_html_tags and docstring:
        try:
            docstring = remove_html_tags(docstring)
        except Exception:
            # HTML stripping stays best effort: keep the text as it was and
            # carry on, but leave a trace instead of hiding the failure.
            import logging

            logging.getLogger(__name__).debug(
                "remove_html_tags failed; keeping docstring unchanged",
                exc_info=True,
            )
    if no_doctags:
        docstring = remove_doctags(docstring, keep_inside=True, language=language)
    if tokenize:
        docstring = tokenize_csn(docstring)
    return docstring
def line_to_words(self, line):
    """Collect the cleaned text and font of every word entry in ``line``.

    Entries of ``line["content"]`` whose ``"type"`` is not ``"word"`` are
    skipped. For the remaining ones the ``"content"`` value is passed through
    ``fix_bad_unicode`` and stripped, and the ``"font"`` value is kept as is.

    Returns:
        A ``(words, fonts)`` pair of lists with one element per word entry,
        in the original order.
    """
    # Build (text, font) pairs in one pass so each entry is read in the same
    # order as it appears, then split them into the two parallel lists.
    pairs = [
        (fix_bad_unicode(entry["content"]).strip(), entry["font"])
        for entry in line["content"]
        if entry["type"] == "word"
    ]
    words = [text for text, _ in pairs]
    fonts = [font for _, font in pairs]
    return words, fonts
def test_fix_bad_unicode():
    """Check ``cleantext.fix_bad_unicode`` on text with escaped curly quotes."""
    # Intended reading of the input: and install a ‘new’ society in their
    escaped = "and install a \\u2018new\\u2019 society in their"
    cleaned = cleantext.fix_bad_unicode(escaped)
    assert cleaned == "and install a 'new' society in their"
def only_text(es):
    """Join the stripped ``"content"`` of every word found in ``es``.

    For each item of ``es`` the entries returned by
    ``extract_elements(item, "word")`` are visited in order; their stripped
    contents are joined with single spaces and the resulting string is passed
    through ``fix_bad_unicode``.
    """
    stripped_words = (
        word["content"].strip()
        for element in es
        for word in extract_elements(element, "word")
    )
    return fix_bad_unicode(" ".join(stripped_words))