def get_clean(x):
    """Return a lower-cased, cleaned-up string built from ``x``.

    ``x`` is coerced with ``str()``, lower-cased, stripped of backslashes,
    and has its underscores turned into spaces. The text is then passed,
    in order, through ``ps.cont_exp``, ``ps.remove_emails``,
    ``ps.remove_urls``, ``ps.remove_accented_chars`` and
    ``ps.remove_special_chars``. Finally, any run of three or more
    identical characters is collapsed to a single occurrence.

    HTML tags are not stripped here: ``ps.remove_html_tags`` is not part
    of this pipeline.
    """
    text = str(x).lower()
    text = text.replace('\\', '').replace('_', ' ')

    # Order matters: each helper receives the previous helper's output.
    pipeline = (
        ps.cont_exp,
        ps.remove_emails,
        ps.remove_urls,
        ps.remove_accented_chars,
        ps.remove_special_chars,
    )
    for step in pipeline:
        text = step(text)

    # The pattern matches one character followed by two or more repeats of
    # itself (i.e. 3+ in a row) and keeps a single copy of that character.
    return re.sub("(.)\\1{2,}", "\\1", text)
Exemplo n.º 2
0
def get_clean(x):
    """Return a lower-cased, normalised string built from ``x``.

    Processing order:

    1. ``str(x)`` is lower-cased; backslashes and underscores become
       spaces.
    2. ``kgp.remove_emails`` and ``kgp.remove_urls`` run while periods are
       still present in the text.
    3. Periods are replaced with spaces.
    4. The text goes through ``kgp.cont_exp``, ``kgp.remove_html_tags``,
       ``kgp.remove_rt``, ``kgp.remove_accented_chars``,
       ``kgp.remove_special_chars``, ``kgp.remove_dups_char`` and
       ``kgp.make_base``, in that order.

    Previously the periods were replaced *before* the e-mail/URL steps, so
    an address such as ``user@example.com`` had already been broken into
    ``user@example com`` by the time those helpers saw it.
    """
    text = str(x).lower().replace('\\', ' ').replace('_', ' ')

    # Strip addresses while their dots are intact.
    # NOTE(review): assumes kgp's e-mail/URL matchers rely on the '.' in
    # domain names -- confirm against the library implementation.
    text = kgp.remove_emails(text)
    text = kgp.remove_urls(text)

    # Only now turn the remaining periods into word separators, so the
    # later steps see the same dot-free text they did before this fix.
    text = text.replace('.', ' ')

    text = kgp.cont_exp(text)
    text = kgp.remove_html_tags(text)
    text = kgp.remove_rt(text)
    text = kgp.remove_accented_chars(text)
    text = kgp.remove_special_chars(text)
    text = kgp.remove_dups_char(text)
    text = kgp.make_base(text)
    return text
Exemplo n.º 3
0
 def text_preprocessing(self, df, col_name):
     """Clean the text stored in ``df[col_name]`` and return ``df``.

     The column is overwritten after every step, so the frame passed in
     is modified in place. Each step is applied element-wise through
     ``progress_apply``, one full pass over the column per step.

     Args:
         df: Frame holding the text column; its column must expose
             ``progress_apply``.
         col_name: Name of the column to clean.

     Returns:
         The same ``df`` object, with ``col_name`` replaced by the
         cleaned text.
     """
     # Order matters: each step consumes the previous step's output.
     steps = (
         lambda value: str(value).lower(),
         ps.remove_urls,
         ps.cont_exp,  # you're -> you are; i'm -> i am
         ps.remove_emails,
         ps.remove_html_tags,
         ps.remove_stopwords,
         ps.remove_special_chars,
         ps.remove_accented_chars,
         # Second URL pass, kept as in the original sequence.
         # NOTE(review): possibly redundant after the earlier pass -- confirm.
         ps.remove_urls,
         ps.make_base,  # ran -> run
     )
     for step in steps:
         df[col_name] = df[col_name].progress_apply(step)
     return df