示例#1
0
def test_replace_urls():
    """Check ``cleantext.replace_urls`` against fixed (input, expected) pairs.

    Each input is passed to ``replace_urls`` with ``"*URL*"`` as the
    replacement, and the result must equal the expected string exactly.
    """
    placeholder = "*URL*"
    # (raw input, expected output) pairs: bare/schemed URLs in running text,
    # and a URL embedded in a Markdown link.
    cases = (
        (
            "I learned everything I know from www.stackoverflow.com and http://wikipedia.org/ and Mom.",
            "I learned everything I know from *URL* and *URL* and Mom.",
        ),
        (
            "There's a bunch of references in that one scene alone, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29), which comes out later this year.",
            "There's a bunch of references in that one scene alone, including [Moana](*URL*), which comes out later this year.",
        ),
    )

    for raw, expected in cases:
        actual = cleantext.replace_urls(raw, placeholder)
        assert actual == expected
示例#2
0
    def clean(cls, text):
        """Prepare text for language classification.

        The text is passed through ``cleantext.replace_urls`` and then
        ``cleantext.replace_emails``, each with an empty replacement string.

        Args:
            text (str): Source text

        Returns:
            str: Text after both replacements have been applied
        """
        # Imported locally so the dependency is only loaded when cleaning runs.
        import cleantext

        without_urls = cleantext.replace_urls(text, replace_with='')
        return cleantext.replace_emails(without_urls, replace_with='')
# Demo: masking URLs and e-mail addresses with the `cleantext` package.
# Install first (notebook shell syntax): !pip install clean-text[gpl]

import cleantext

# replacing urls
text = "www.stackoverflow.com is an amzing website"
cleantext.replace_urls(text, "<URL>")
# Result shown by the original example in a REPL:
# '<URL> is an amzing website'

# replacing emails
# The input must contain a well-formed address for a replacement to occur;
# example.com is a reserved documentation domain.
text = "My email id is user@example.com"
cleantext.replace_emails(text, "<EMAIL>")
# Result shown by the original example in a REPL:
# 'My email id is <EMAIL>'
示例#4
0
def remove_urls(text: str, replace_with: str = None) -> str:
    """Delegate to ``replace_urls``, forwarding both arguments unchanged.

    Args:
        text: Text to process.
        replace_with: Forwarded as the ``replace_with`` keyword of
            ``replace_urls``; defaults to ``None``.

    Returns:
        Whatever ``replace_urls`` returns for these arguments.
    """
    # NOTE(review): how replace_urls treats replace_with=None is not visible
    # here — confirm that None results in removal as the name suggests.
    result = replace_urls(text, replace_with=replace_with)
    return result