def remove_brackets(s: TextSeries) -> TextSeries:
    """
    Strip every bracketed span from the texts, brackets included.

    The Series is passed, in order, through the round, curly, square and
    angle bracket removers, so content enclosed in ``()``, ``{}``, ``[]``
    or ``<>`` is dropped.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Texthero (round) [square] [curly] [angle]")
    >>> hero.remove_brackets(s)
    0    Texthero
    dtype: object

    See also
    --------
    :meth:`remove_round_brackets`
    :meth:`remove_curly_brackets`
    :meth:`remove_square_brackets`
    :meth:`remove_angle_brackets`

    """
    # Order matters only in that it mirrors the documented chain:
    # round -> curly -> square -> angle.
    removers = (
        remove_round_brackets,
        remove_curly_brackets,
        remove_square_brackets,
        remove_angle_brackets,
    )
    stripped = s
    for remover in removers:
        stripped = stripped.pipe(remover)
    return stripped
def has_content(s: TextSeries) -> TextSeries:
    r"""
    Flag which rows of the Series actually hold text.

    A row is marked True only when it is not missing (NaN/None) and is
    not the empty string after :meth:`remove_whitespace` has been applied
    to it.

    Examples
    --------
    >>> import texthero as hero
    >>> import numpy as np
    >>> import pandas as pd
    >>> s = pd.Series(["content", np.nan, "\t\n", " "])
    >>> hero.has_content(s)
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    """
    # Rows that are non-empty once passed through the whitespace remover.
    is_non_empty = s.pipe(remove_whitespace) != ""
    # Rows that are not missing values.
    is_present = s.notna()
    return is_non_empty & is_present
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Run a text-based Pandas Series through a pre-processing pipeline.

    When no pipeline is supplied, the one returned by
    ``get_default_pipeline()`` is applied.

    Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_html_tags`
     5. :meth:`texthero.preprocessing.remove_punctuation`
     6. :meth:`texthero.preprocessing.remove_diacritics`
     7. :meth:`texthero.preprocessing.remove_stopwords`
     8. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
       Functions applied one after another, each taking a Pandas Series
       and returning a Pandas Series. If None (or empty), the default
       pipeline is used.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object

    """
    # NOTE(review): a second ``def clean`` appears later in this file with
    # a different default-pipeline list in its docstring; the later
    # definition is the one bound at import time. Confirm which to keep.
    steps = pipeline or get_default_pipeline()

    cleaned = s
    for step in steps:
        cleaned = cleaned.pipe(step)
    return cleaned
def restore_tweets(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Pre-process a text-based Pandas Series of tweets, by using the
    following pipeline.

    Twitter pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.replace_emojis`
     3. :meth:`texthero.preprocessing.replace_urls`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[Pandas Series, Pandas Series]],
               optional, default=None
       Specific pipeline to clean the texts. Has to be a list
       of functions taking as input and returning as output
       a Pandas Series. If None (or empty), the pipeline returned by
       ``get_twitter_post_pipeline()`` is used.

    Returns
    -------
    The Series obtained by applying every pipeline function in order.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("the book of the jungle 😈 https://example.com")
    >>> hero.restore_tweets(s)
    0    the book of the jungle :smiling_face_with_horns: <URL>
    dtype: object

    """
    # The default is resolved here, per call, instead of in the signature:
    # a call in the signature runs once at definition time and hands the
    # same list object to every caller.
    if not pipeline:
        pipeline = get_twitter_post_pipeline()

    for f in pipeline:
        s = s.pipe(f)
    return s
def clean(s: TextSeries, pipeline=None) -> TextSeries:
    """
    Apply a cleaning pipeline to a text-based Pandas Series.

    Each function of the pipeline is applied in sequence to the result
    of the previous one. Without an explicit pipeline, the one returned
    by ``get_default_pipeline()`` is used.

    Default pipeline:
     1. :meth:`texthero.preprocessing.fillna`
     2. :meth:`texthero.preprocessing.lowercase`
     3. :meth:`texthero.preprocessing.remove_digits`
     4. :meth:`texthero.preprocessing.remove_punctuation`
     5. :meth:`texthero.preprocessing.remove_diacritics`
     6. :meth:`texthero.preprocessing.remove_stopwords`
     7. :meth:`texthero.preprocessing.remove_whitespace`

    Parameters
    ----------
    s : :class:`texthero._types.TextSeries`

    pipeline : List[Callable[[Pandas Series], Pandas Series]],
               optional, default=None
       Custom list of functions used to clean the texts, each taking
       and returning a Pandas Series. If None (or empty), the default
       pipeline is used.

    Examples
    --------
    For the default pipeline:

    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series("Uper 9dig.        he her ÄÖÜ")
    >>> hero.clean(s)
    0    uper 9dig aou
    dtype: object

    """
    # NOTE(review): ``clean`` is also defined earlier in this file, and
    # that docstring additionally lists ``remove_html_tags`` in the
    # default pipeline. This later definition overrides the earlier one;
    # verify the step list against ``get_default_pipeline()`` and drop
    # one of the two definitions.
    if pipeline:
        chosen_steps = pipeline
    else:
        chosen_steps = get_default_pipeline()

    current = s
    for transform in chosen_steps:
        current = current.pipe(transform)
    return current