def feature_number_of_emojis_in_tweet(
    df: pd.DataFrame,
    df_timeline: pd.DataFrame,
    user_id_str_col_name: str = "user_id_str",
    in_col_name: str = "text",
    out_col_name: str = "number_of_emoji",
) -> pd.DataFrame:
    """Attach per-user distribution features of emoji counts in timeline texts.

    For every distinct value of ``df[user_id_str_col_name]``, the texts in
    ``df_timeline[in_col_name]`` belonging to that user are passed through
    ``emoji_count`` and the resulting list of counts is summarised by
    ``distribution_feature``. Each key of that summary becomes a column named
    ``f"{out_col_name}.{key}"`` in the returned frame.

    Args:
        df: One row per user (duplicates allowed); must contain
            ``user_id_str_col_name``.
        df_timeline: One row per text; must contain ``user_id_str_col_name``
            and ``in_col_name``.
        user_id_str_col_name: Name of the user-id column shared by both frames.
        in_col_name: Column of ``df_timeline`` holding the text to scan.
        out_col_name: Prefix for the generated feature columns.

    Returns:
        A new DataFrame: ``df`` left-joined with the feature columns on the
        user-id column. When ``df`` contains no users, a copy of ``df`` is
        returned without feature columns, because their names are only known
        from the output of ``distribution_feature``.
    """
    user_ids = df[user_id_str_col_name].unique()
    # ``unique()`` returns an array, so use len() rather than truthiness.
    if len(user_ids) == 0:
        return df.copy()

    # Bucket the timeline texts by user once instead of re-scanning the whole
    # timeline with a boolean mask for every user.
    texts_by_user = {
        user_id: texts
        for user_id, texts in df_timeline.groupby(
            user_id_str_col_name, sort=False
        )[in_col_name]
    }

    rows = []
    for user_id in user_ids:
        texts = texts_by_user.get(user_id)
        # Users without any timeline rows still get summarised, from an
        # empty list of counts.
        counts = [] if texts is None else [emoji_count(text) for text in texts]
        # NOTE(review): distribution_feature is assumed to return a mutable
        # mapping (e.g. a dict) of statistic name -> value; confirm.
        stats = distribution_feature(counts)
        stats[user_id_str_col_name] = user_id
        rows.append(stats)

    df_stats = pd.DataFrame(rows)
    df_stats.columns = [
        c if c == user_id_str_col_name else f"{out_col_name}.{c}"
        for c in df_stats.columns
    ]

    # Drop feature columns left over from an earlier run so that the merge
    # overwrites them instead of joining on them or producing _x/_y suffixes.
    stale = [
        c
        for c in df_stats.columns
        if c != user_id_str_col_name and c in df.columns
    ]
    base = df.drop(columns=stale) if stale else df

    # Join explicitly on the user id; a left join guarantees no row of ``df``
    # is ever dropped.
    return base.merge(df_stats, on=user_id_str_col_name, how="left")
def feature_number_of_emoji_in_description(
    df: pd.DataFrame,
    in_col_name: str = "description",
    out_col_name: str = "number_of_emoji_in_description",
) -> pd.DataFrame:
    """Add a column with the emoji count of each row's description.

    ``emoji_count`` is applied element-wise to ``df[in_col_name]`` and the
    result is stored in ``df[out_col_name]``.

    Args:
        df: Frame containing ``in_col_name``; it is modified in place.
        in_col_name: Column whose values are passed to ``emoji_count``.
        out_col_name: Name of the column that receives the counts.

    Returns:
        The same ``df`` object, with ``out_col_name`` added or overwritten.
    """
    descriptions = df[in_col_name]
    df[out_col_name] = descriptions.apply(emoji_count)
    return df
def feature_ratio_of_emoji_in_description(
    df: pd.DataFrame,
    in_col_name: str = "description",
    out_col_name: str = "ratio_of_emoji_in_description",
) -> pd.DataFrame:
    """Add a column with the emoji-to-length ratio of each row's description.

    For every value in ``df[in_col_name]`` the ratio is
    ``emoji_count(value) / len(value)``; values of length zero yield ``0``.

    Args:
        df: Frame containing ``in_col_name``; it is modified in place.
        in_col_name: Column whose values are measured.
        out_col_name: Name of the column that receives the ratios.

    Returns:
        The same ``df`` object, with ``out_col_name`` added or overwritten.
    """

    def _emoji_ratio(text):
        # Guard against division by zero for zero-length values.
        length = len(text)
        if length > 0:
            return emoji_count(text) / length
        return 0

    df[out_col_name] = df[in_col_name].apply(_emoji_ratio)
    return df
def feature_number_of_emoji_in_user_name(
    df: pd.DataFrame,
    in_col_name: str = "name",
    out_col_name: str = "number_of_emoji_in_user_name",
) -> pd.DataFrame:
    """Add a column with the emoji count of each row's user name.

    ``emoji_count`` is applied element-wise to ``df[in_col_name]`` and the
    result is stored in ``df[out_col_name]``.

    Args:
        df: Frame containing ``in_col_name``; it is modified in place.
        in_col_name: Column whose values are passed to ``emoji_count``.
        out_col_name: Name of the column that receives the counts.

    Returns:
        The same ``df`` object, with ``out_col_name`` added or overwritten.
    """
    names = df[in_col_name]
    df[out_col_name] = names.apply(emoji_count)
    return df