def get_userdf_parts(user_df):
    """Extract the profile fields we keep from the raw user json dict.

    user_df (dict): Raw dictionary from json that contains user information.

    Returns:
        user_df_kept (dict): Subset of ``user_df`` restricted to the keys of
        interest, with ``birth_date`` truncated to its date portion and the
        result passed through ``utils.check_dict_types``.

    Raises:
        IndexError: If any expected key is missing from ``user_df``.

    Note: the export contains a lot of personal information which this
    function purposely avoids carrying forward.
    """
    wanted_fields = [
        'active_time', 'age_filter_max', 'age_filter_min', 'bio',
        'birth_date', 'connection_count', 'create_date', 'education',
        'gender', 'gender_filter', 'jobs', 'schools'
    ]

    # Fail fast on the first expected field that is absent.
    missing = next((k for k in wanted_fields if k not in user_df), None)
    if missing is not None:
        raise IndexError(missing + " not in input keys: " + str(user_df.keys()))

    # Keep only the whitelisted fields, preserving the input's ordering.
    user_df_kept = {}
    for field, value in user_df.items():
        if field in wanted_fields:
            user_df_kept[field] = value

    # Keep just the leading date portion of the birth timestamp.
    user_df_kept['birth_date'] = user_df_kept['birth_date'][:10]

    return utils.check_dict_types(user_df_kept)
def gather_usage_stats(usage_df):
    """Compute derived usage metrics from the daily usage dataframe.

    usage_df (pandas DataFrame): One row per day the app was opened,
        indexed by date, with columns 'swipes_likes', 'swipes_passes',
        'app_opens', 'matches', 'messages_received', 'messages_sent'.

    Returns:
        derived_metrics (dict): Metric name -> value, rounded to 2 decimal
        places and passed through ``utils.check_dict_types``.
    """
    usage_sums = usage_df.sum()
    derived_metrics = {}

    # Number of swipes -- computed once and reused by the ratios below.
    total_swipes = usage_sums['swipes_likes'] + usage_sums['swipes_passes']
    derived_metrics["total_swipes"] = total_swipes

    # Likes to passes (for every 1 pass there are x likes)
    derived_metrics["like_to_pass_ratio"] = (
        usage_sums['swipes_likes'] / usage_sums['swipes_passes'])

    # Number of swipes per app open
    derived_metrics["swipes/app_open"] = total_swipes / usage_sums['app_opens']

    # Avg messages received per match
    derived_metrics["n_avg_msg_rec_per_match"] = (
        usage_sums['messages_received'] / usage_sums['matches'])

    # Avg messages sent per match
    derived_metrics["n_avg_msg_sent_per_match"] = (
        usage_sums['messages_sent'] / usage_sums['matches'])

    # Calendar days on tinder (first recorded day to last). Clamp to at
    # least 1 so a single-day history does not divide by zero / yield inf.
    days_on_tinder = (pd.to_datetime(usage_df.iloc[-1].name)
                      - pd.to_datetime(usage_df.iloc[0].name)).days
    days_on_tinder = max(days_on_tinder, 1)

    # Days when you opened the app
    active_days_on_tinder = len(usage_df.index)

    # Avg swipes per calendar day and per active day
    derived_metrics["swipes_per_tot_cal_day"] = total_swipes / days_on_tinder
    derived_metrics["swipes_per_act_day"] = total_swipes / active_days_on_tinder

    derived_metrics = {k: np.round(v, 2) for k, v in derived_metrics.items()}
    return utils.check_dict_types(derived_metrics)
def get_message_metrics(message_df):
    """ Get metrics from message dataframe

    message_df (pandas DataFrame): Dataframe with info about messages sent,
        time, content of message, and whom it was sent. Expected to be
        indexed by ('match_id', 'msg_number') with columns 'sent_date',
        'n_words_in_msg' and 'message'.

    Returns
        metrics_to_save (dict): dictionary of name of metric and value

    Raises:
        IndexError: If an expected column or index level is missing.
    """
    # Input Validation. str() is required around the pandas containers:
    # concatenating str + Index broadcasts elementwise and str + FrozenList
    # raises TypeError, so without it the error path itself was broken.
    expected_columns = ['sent_date', 'n_words_in_msg', 'message']
    for col in expected_columns:
        if col not in message_df.columns:
            raise IndexError(col + " not in dataframe columns " +
                             str(message_df.columns))
    expected_index_keys = ['match_id', 'msg_number']
    for col in expected_index_keys:
        if col not in message_df.index.names:
            raise IndexError(col + " not in dataframe columns " +
                             str(message_df.index.names))

    # Set up -- needed for multi index slicing
    idx = pd.IndexSlice

    # Metrics to save
    metrics_to_save = {}
    metrics_to_save["Date of First Message Sent"] = message_df[
        'sent_date'].min().strftime('%b %d %Y')
    metrics_to_save["Date of Last Message Sent"] = message_df[
        'sent_date'].max().strftime('%b %d %Y')
    metrics_to_save["Number of Matches"] = message_df.index.get_level_values(
        'match_id').nunique()
    # Matches without messages appear with msg_number == -1 -- TODO confirm
    # against the upstream parser.
    metrics_to_save["Number Matches with no Messages"] = message_df.loc[
        idx[:, -1], ].shape[0]
    metrics_to_save[
        "Most Number of Messages Sent to a Match"] = message_df.index.get_level_values(
            'msg_number').max()
    metrics_to_save["Average Number of Words per Message"] = message_df[
        'n_words_in_msg'].mean().round(2)
    metrics_to_save["Median Number of Words per Message"] = np.round(
        message_df['n_words_in_msg'].quantile(0.50), 2)

    # Time calculations: express the first-to-last span as years/months/days
    # using fixed 365-day years and 30-day months.
    time_diff_days = (message_df['sent_date'].max() -
                      message_df['sent_date'].min()).days
    years = int(time_diff_days / 365)
    months = int((time_diff_days % 365) / 30)
    days = (time_diff_days % 365) % 30
    metrics_to_save["Total Time on Tinder from First Message to Last"] = "{} years {} months {} days" \
        .format(years, months, days)

    # First message analysis
    first_msg = message_df.loc[idx[:, 0], "message"]
    first_msg = pd.DataFrame(first_msg)

    # Clean and generate additional flags
    first_msg['message'] = first_msg['message'].str.rstrip(' ')
    same_first_message = first_msg['message'].value_counts().sort_values(
        ascending=False)
    first_msg['hey_hi_flag'] = first_msg['message'].str.count(
        '((H|h)ey|(H|h)i)')
    first_msg['How \'sit going_flag'] = first_msg['message'].str.contains(
        'it going')

    # Save some metrics
    metrics_to_save["Hey or Hi in First Message"] = first_msg[
        'hey_hi_flag'].sum()
    metrics_to_save["(How's ) it going in First Message"] = first_msg[
        'How \'sit going_flag'].sum()

    # Top-3 most common openers. Use .iloc for positional access (the
    # integer-label fallback of Series[int] is deprecated in pandas) and
    # stop early instead of raising when fewer than three distinct first
    # messages exist.
    for rank, ordinal in enumerate(["Most", "Second Most", "Third Most"]):
        if rank >= len(same_first_message):
            break
        metrics_to_save[ordinal + " Common First Message"] = \
            same_first_message.index[rank]
        metrics_to_save["Number of Times " + ordinal +
                        " Common First Message Used"] = \
            same_first_message.iloc[rank]

    return utils.check_dict_types(metrics_to_save)