def replace_urls_w_placeholder(s: TextSeries) -> TextSeries:
    """Replace every URL in *s* with its placeholder.

    A URL is any run of non-whitespace characters starting with ``http``.
    The placeholder text for each distinct URL is produced by
    ``_add_url_placeholder``, which is called exactly once per distinct URL,
    in order of first appearance.

    Args:
        s: Series of strings (accessed through the ``.str`` accessor).

    Returns:
        A new Series with the URLs substituted; *s* itself is not modified.
    """
    url_pattern = r"(http\S+)"

    # Build the URL -> placeholder mapping up front so the helper runs once
    # per distinct URL regardless of how often that URL occurs.
    found = s.str.extractall(url_pattern)[0].unique()
    placeholders = {url: _add_url_placeholder(url) for url in found}

    # Substitute in a single regex pass. Replacing each URL as a literal
    # substring, one after another, corrupts longer URLs that merely start
    # with a shorter one (e.g. "http://a.com" inside "http://a.com/page")
    # and can re-match text inside placeholders that were already inserted.
    return s.str.replace(
        url_pattern,
        lambda match: placeholders[match.group(0)],
        regex=True,
    )
def replace_mentions_w_placeholder(s: TextSeries) -> TextSeries:
    """Replace every ``@mention`` in *s* with its placeholder.

    A mention is ``@`` followed by one or more ASCII letters or digits.
    The placeholder text for each distinct mention is produced by
    ``_add_mention_placeholder``, which is called exactly once per distinct
    mention, in order of first appearance.

    Args:
        s: Series of strings (accessed through the ``.str`` accessor).

    Returns:
        A new Series with the mentions substituted; *s* itself is not
        modified.
    """
    mention_pattern = r"(@[a-zA-Z0-9]+)"

    # Build the mention -> placeholder mapping up front so the helper runs
    # once per distinct mention regardless of how often it occurs.
    found = s.str.extractall(mention_pattern)[0].unique()
    placeholders = {
        mention: _add_mention_placeholder(mention) for mention in found
    }

    # Substitute in a single regex pass. Replacing each mention as a literal
    # substring, one after another, corrupts longer mentions that share a
    # prefix with a shorter one (e.g. "@bob" inside "@bobby") and can
    # re-match text inside placeholders that were already inserted.
    return s.str.replace(
        mention_pattern,
        lambda match: placeholders[match.group(0)],
        regex=True,
    )
def replace_hashtags_w_placeholder(s: TextSeries) -> TextSeries:
    """Replace every ``#hashtag`` in *s* with its placeholder.

    A hashtag is ``#`` followed by one or more ASCII letters, digits or
    underscores. The placeholder text for each distinct hashtag is produced
    by ``_add_hashtag_placeholder``, which is called exactly once per
    distinct hashtag, in order of first appearance.

    Args:
        s: Series of strings (accessed through the ``.str`` accessor).

    Returns:
        A new Series with the hashtags substituted; *s* itself is not
        modified.
    """
    hashtag_pattern = r"(#[a-zA-Z0-9_]+)"

    # Build the hashtag -> placeholder mapping up front so the helper runs
    # once per distinct hashtag regardless of how often it occurs.
    found = s.str.extractall(hashtag_pattern)[0].unique()
    placeholders = {
        hashtag: _add_hashtag_placeholder(hashtag) for hashtag in found
    }

    # Substitute in a single regex pass. Replacing each hashtag as a literal
    # substring, one after another, corrupts longer hashtags that share a
    # prefix with a shorter one (e.g. "#ai" inside "#airplane") and can
    # re-match text inside placeholders that were already inserted.
    return s.str.replace(
        hashtag_pattern,
        lambda match: placeholders[match.group(0)],
        regex=True,
    )