Exemplo n.º 1
0
def test_fix_bad_unicode():
    """Check ``cleantext.fix_bad_unicode`` on escaped quotes and accented words."""
    # The input carries literal backslash-u escape sequences for the curly
    # quotes (U+2018 / U+2019); the expected output uses plain apostrophes.
    escaped = "and install a \\u2018new\\u2019 society in their"
    expected = "and install a 'new' society in their"
    assert cleantext.fix_bad_unicode(escaped) == expected

    # Words that already contain valid non-ASCII letters must pass through
    # unchanged, in both lower-case and capitalised form.
    for word in ("všetko", "Všetko"):
        assert word == cleantext.fix_bad_unicode(word)
Exemplo n.º 2
0
def clean(
        docstring: str, language: Optional[str] = None,
        extract_summary: bool = True,
        no_comment_delimiters: bool = True,
        no_html_tags: bool = True,
        no_doctags: bool = True,
        no_urls: bool = True, url_replacement: str = "",
        tokenize: bool = True,
        fix_unicode: bool = True
):
    """Run a docstring through a configurable sequence of cleaning steps.

    Each enabled step hands the current value to a helper and continues with
    whatever that helper returns. The steps always run in this fixed order:

    1. ``remove_comment_delimiters``   (``no_comment_delimiters``)
    2. ``extract_docstring_summary``   (``extract_summary``)
    3. ``fix_bad_unicode``             (``fix_unicode``)
    4. ``remove_urls``                 (``no_urls``)
    5. ``remove_html_tags``            (``no_html_tags``)
    6. ``remove_doctags``              (``no_doctags``)
    7. ``tokenize_csn``                (``tokenize``)

    Args:
        docstring: The raw docstring to clean.
        language: Forwarded to ``extract_docstring_summary`` and
            ``remove_doctags``.
        extract_summary: Enable step 2.
        no_comment_delimiters: Enable step 1.
        no_html_tags: Enable step 5. The step is skipped when the current
            value is falsy, and any exception it raises is suppressed.
        no_doctags: Enable step 6 (called with ``keep_inside=True``).
        no_urls: Enable step 4.
        url_replacement: Passed to ``remove_urls`` as ``replace_with``.
        tokenize: Enable step 7.
        fix_unicode: Enable step 3.

    Returns:
        The value produced by the last enabled step, or ``docstring`` itself
        when every step is disabled.
    """
    # Function-scope import so the block does not depend on the module's
    # import section.
    import logging

    if no_comment_delimiters:
        docstring = remove_comment_delimiters(docstring)
    if extract_summary:
        docstring = extract_docstring_summary(docstring, language=language)
    if fix_unicode:
        docstring = fix_bad_unicode(docstring)
    if no_urls:
        docstring = remove_urls(docstring, replace_with=url_replacement)
    if no_html_tags and docstring:
        try:
            docstring = remove_html_tags(docstring)
        except Exception:
            # Deliberately best effort: if tag stripping fails, carry on with
            # the unstripped text. Log it so the failure stays discoverable.
            logging.getLogger(__name__).debug(
                "remove_html_tags failed; keeping docstring unchanged",
                exc_info=True,
            )
    if no_doctags:
        docstring = remove_doctags(docstring, keep_inside=True, language=language)
    if tokenize:
        docstring = tokenize_csn(docstring)
    return docstring
Exemplo n.º 3
0
 def line_to_words(self, line):
     """Collect the cleaned text and the font of every word item in a line.

     Iterates ``line["content"]`` and keeps only items whose ``"type"`` is
     ``"word"``. For each one, its ``"content"`` is passed through
     ``fix_bad_unicode`` and stripped of surrounding whitespace.

     Returns:
         A ``(words, fonts)`` tuple of two parallel lists: the cleaned word
         strings and the matching ``"font"`` values, in input order.
     """
     texts = []
     font_names = []
     for item in line["content"]:
         # Anything that is not a word is ignored.
         if item["type"] != "word":
             continue
         texts.append(fix_bad_unicode(item["content"]).strip())
         font_names.append(item["font"])
     return texts, font_names
Exemplo n.º 4
0
def test_fix_bad_unicode():
    """Check that escaped curly quotes come back as plain apostrophes."""
    # Literal backslash-u escapes for U+2018 / U+2019 surround the word "new".
    escaped = "and install a \\u2018new\\u2019 society in their"
    cleaned = cleantext.fix_bad_unicode(escaped)
    assert cleaned == "and install a 'new' society in their"
Exemplo n.º 5
0
def only_text(es):
    """Join the stripped text of every "word" element into one cleaned string.

    For each entry of ``es``, ``extract_elements(entry, "word")`` supplies the
    elements to use. Each element's ``"content"`` is stripped, the pieces are
    joined with single spaces, and the joined string is passed through
    ``fix_bad_unicode``.
    """
    pieces = [
        element["content"].strip()
        for entry in es
        for element in extract_elements(entry, "word")
    ]
    joined = " ".join(pieces)
    return fix_bad_unicode(joined)