예제 #1
0
def remove_brackets(s: TextSeries) -> TextSeries:
    """
    Strip every bracketed span, together with the brackets themselves.

    The Series is passed, in order, through the round ``()``, curly ``{}``,
    square ``[]`` and angle ``<>`` bracket removers; each step receives the
    output of the previous one.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Texthero (round) [square] [curly] [angle]")
    >>> hero.remove_brackets(s)
    0    Texthero
    dtype: object

    See also
    --------
    :meth:`remove_round_brackets`
    :meth:`remove_curly_brackets`
    :meth:`remove_square_brackets`
    :meth:`remove_angle_brackets`

    """

    # Apply one remover per bracket family, feeding each result forward.
    without_round = s.pipe(remove_round_brackets)
    without_curly = without_round.pipe(remove_curly_brackets)
    without_square = without_curly.pipe(remove_square_brackets)
    return without_square.pipe(remove_angle_brackets)
예제 #2
0
def has_content(s: TextSeries) -> TextSeries:
    r"""
    Build a Boolean Pandas Series flagging which rows hold actual content.

    A row is flagged True only when it is not missing and its value, after
    being passed through :meth:`remove_whitespace`, differs from the empty
    string.

    Examples
    --------
    >>> import texthero as hero
    >>> import numpy as np
    >>> import pandas as pd
    >>> s = pd.Series(["content", np.nan, "\t\n", " "])
    >>> hero.has_content(s)
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    """
    # Rows that still differ from "" once whitespace has been removed.
    non_empty = s.pipe(remove_whitespace) != ""
    # Rows holding a real value; NaN compares unequal to "" above, so it
    # has to be excluded explicitly.
    present = s.notna()
    return non_empty & present
예제 #3
0
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Run a text-based Pandas Series through a preprocessing pipeline.

    Every function of the pipeline is applied in order, each one receiving
    the Series produced by the previous one. When no pipeline is supplied,
    the one returned by ``get_default_pipeline()`` is used.

     Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_html_tags`
     5. :meth:`texthero.preprocessing.remove_punctuation`
     6. :meth:`texthero.preprocessing.remove_diacritics`
     7. :meth:`texthero.preprocessing.remove_stopwords`
     8. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
       Functions to apply, each taking a Pandas Series and returning a
       Pandas Series. If None (or any other falsy value, such as an
       empty list), the default pipeline is used.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object
    """

    cleaned = s
    # A falsy pipeline (None, empty list) falls back to the default one.
    for step in pipeline or get_default_pipeline():
        cleaned = cleaned.pipe(step)
    return cleaned
예제 #4
0
def restore_tweets(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Pre-process a text-based Pandas Series of tweets, by using the following
    pipeline.

     Twitter pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.replace_emojis`
     3. :meth:`texthero.preprocessing.replace_urls`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
       Specific pipeline to clean the texts. Has to be a list
       of functions taking as input and returning as output
       a Pandas Series. If None (or any other falsy value, such as an
       empty list), the pipeline returned by
       ``get_twitter_post_pipeline()`` is used.

    Returns
    -------
    The Series obtained after applying every pipeline function in order.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("the book of the jungle 😈 https://example.com")
    >>> hero.restore_tweets(s)
    0    the book of the jungle :smiling_face_with_horns: <URL>
    dtype: object
    """

    # The default pipeline is resolved at call time rather than in the
    # signature: a default evaluated at definition time would be a single
    # list object shared (and mutable) across every call.
    if not pipeline:
        pipeline = get_twitter_post_pipeline()

    for f in pipeline:
        s = s.pipe(f)
    return s
예제 #5
0
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Run a text-based Pandas Series through a preprocessing pipeline.

    Every function of the pipeline is applied in order, each one receiving
    the Series produced by the previous one. When no pipeline is supplied,
    the one returned by ``get_default_pipeline()`` is used.

     Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[[Pandas Series], Pandas Series]],
               optional, default=None
       Specific pipeline used to clean the text. If None (or any other
       falsy value, such as an empty list), the default pipeline is used.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object
    """

    # Fall back to the default pipeline for a falsy argument.
    steps = pipeline or get_default_pipeline()

    current = s
    for transform in steps:
        current = current.pipe(transform)
    return current