Пример #1
0
def get_userdf_parts(user_df):
    """
    Reduce the user profile section of the json to a fixed set of fields

    user_df (dict):
        Raw dictionary from json that contains user information

    Returns
    user_df_kept (dict)
        The whitelisted entries of user_df (in user_df's own key order),
        with 'birth_date' truncated to its first 10 characters, after being
        passed through utils.check_dict_types

    Raises
    IndexError
        When a whitelisted key is absent; only the first missing key (in
        whitelist order) is reported

    Everything outside the whitelist is dropped -- the original author notes
    that this section is mostly personal information that is avoided on
    purpose.
    """
    wanted = (
        'active_time', 'age_filter_max', 'age_filter_min', 'bio', 'birth_date',
        'connection_count', 'create_date', 'education', 'gender',
        'gender_filter', 'jobs', 'schools',
    )

    # Fail fast on the first whitelisted key the input does not provide.
    missing = next((name for name in wanted if name not in user_df), None)
    if missing is not None:
        raise IndexError(missing + " not in input keys: " +
                         str(user_df.keys()))

    selected = {}
    for name, value in user_df.items():
        if name in wanted:
            selected[name] = value

    # Keep only the leading 10 characters
    # (presumably the YYYY-MM-DD part of a timestamp -- TODO confirm).
    selected['birth_date'] = selected['birth_date'][:10]

    return utils.check_dict_types(selected)
def gather_usage_stats(usage_df):
    """
    Derive summary ratios from the usage dataframe

    usage_df (pandas DataFrame):
        One row per index label; the first and last index labels must be
        parseable by pd.to_datetime. Columns read: 'swipes_likes',
        'swipes_passes', 'app_opens', 'messages_received', 'messages_sent',
        'matches'
        (presumably one row per day the app was opened, in chronological
        order -- TODO confirm against caller)

    Returns
    derived_metrics (dict):
        Metric name -> value rounded to 2 decimals, after being passed
        through utils.check_dict_types
    """
    totals = usage_df.sum()

    likes = totals['swipes_likes']
    passes = totals['swipes_passes']
    swipe_total = likes + passes

    metrics = {
        # Number of swipes
        "total_swipes": swipe_total,
        # Likes to passes (for every 1 pass there are x likes)
        "like_to_pass_ratio": likes / passes,
        # Number of swipes per app open
        "swipes/app_open": swipe_total / totals['app_opens'],
        # Avg messages received per match
        "n_avg_msg_rec_per_match":
            totals['messages_received'] / totals['matches'],
        # Avg messages sent per match
        "n_avg_msg_sent_per_match":
            totals['messages_sent'] / totals['matches'],
    }

    # Calendar span between the first and the last row of the frame.
    last_label = usage_df.iloc[-1].name
    first_label = usage_df.iloc[0].name
    calendar_days = (pd.to_datetime(last_label) -
                     pd.to_datetime(first_label)).days
    # Number of rows in the frame (days when the app was opened).
    active_days = usage_df.shape[0]

    # Avg swipes per calendar day and per active day
    metrics["swipes_per_tot_cal_day"] = swipe_total / calendar_days
    metrics["swipes_per_act_day"] = swipe_total / active_days

    rounded = {}
    for label, value in metrics.items():
        rounded[label] = np.round(value, 2)

    return utils.check_dict_types(rounded)
Пример #3
0
def get_message_metrics(message_df):
    """
    Get metrics from message dataframe

    message_df (pandas DataFrame):
        Dataframe with info about messages sent, time, content of message, and whom it was sent.
        Must have the columns 'sent_date', 'n_words_in_msg' and 'message',
        and index levels named 'match_id' and 'msg_number'.
        Rows with msg_number == -1 are counted as matches without messages
        and rows with msg_number == 0 are treated as first messages.

    Returns
    metrics_to_save (dict):
        dictionary of name of metric and value, after being passed through
        utils.check_dict_types. When fewer than three distinct first
        messages exist, the missing "Nth Most Common First Message" entries
        are None with a usage count of 0.

    Raises
    IndexError
        If a required column or index level is missing
    """

    # Input Validation
    # The available names are stringified explicitly: concatenating a str
    # with a pandas Index broadcasts element-wise, and with index.names it
    # raises TypeError instead of the intended IndexError.
    expected_columns = ['sent_date', 'n_words_in_msg', 'message']
    for col in expected_columns:
        if col not in message_df.columns:
            raise IndexError(col + " not in dataframe columns " +
                             str(list(message_df.columns)))

    expected_index_keys = ['match_id', 'msg_number']
    for col in expected_index_keys:
        if col not in message_df.index.names:
            raise IndexError(col + " not in dataframe index names " +
                             str(list(message_df.index.names)))

    # Boolean masks on the level values are used instead of label slicing,
    # so a frame without any -1 / 0 message numbers yields an empty
    # selection rather than a KeyError.
    msg_numbers = message_df.index.get_level_values('msg_number')
    sent_dates = message_df['sent_date']
    words_per_msg = message_df['n_words_in_msg']
    first_sent = sent_dates.min()
    last_sent = sent_dates.max()

    # Metrics to save
    metrics_to_save = {}
    metrics_to_save["Date of First Message Sent"] = first_sent.strftime(
        '%b %d %Y')
    metrics_to_save["Date of Last Message Sent"] = last_sent.strftime(
        '%b %d %Y')
    metrics_to_save["Number of Matches"] = message_df.index.get_level_values(
        'match_id').nunique()
    metrics_to_save["Number Matches with no Messages"] = int(
        (msg_numbers == -1).sum())
    metrics_to_save[
        "Most Number of Messages Sent to a Match"] = msg_numbers.max()
    metrics_to_save["Average Number of Words per Message"] = np.round(
        words_per_msg.mean(), 2)
    metrics_to_save["Median Number of Words per Message"] = np.round(
        words_per_msg.quantile(0.50), 2)

    # Time calculations (approximation: 365-day years, 30-day months)
    time_diff_days = (last_sent - first_sent).days
    years, remainder = divmod(time_diff_days, 365)
    months, days = divmod(remainder, 30)
    metrics_to_save["Total Time on Tinder from First Message to Last"] = "{} years {} months {} days" \
        .format(years, months, days)

    # First message analysis
    # Trailing spaces are stripped so otherwise identical openers group
    # together in the counts below.
    first_messages = message_df.loc[msg_numbers == 0,
                                    'message'].str.rstrip(' ')
    first_message_counts = first_messages.value_counts().sort_values(
        ascending=False)

    # NOTE(review): this counts every occurrence of the pattern, including
    # inside other words (e.g. "this" contains "hi") -- confirm whether a
    # word-boundary match per message was intended.
    hey_hi_count = first_messages.str.count('((H|h)ey|(H|h)i)').sum()
    it_going_count = first_messages.str.contains('it going').sum()

    # Save some metrics
    metrics_to_save["Hey or Hi in First Message"] = hey_hi_count
    metrics_to_save["(How's ) it going in First Message"] = it_going_count

    # Positional access goes through .iloc: integer keys on a string-indexed
    # Series are a deprecated positional fallback in newer pandas.
    ranks = ["Most Common", "Second Most Common", "Third Most Common"]
    for position, rank in enumerate(ranks):
        if position < len(first_message_counts):
            text = first_message_counts.index[position]
            times_used = first_message_counts.iloc[position]
        else:
            # Fewer distinct first messages than ranks requested.
            text, times_used = None, 0
        metrics_to_save[rank + " First Message"] = text
        metrics_to_save["Number of Times " + rank +
                        " First Message Used"] = times_used

    metrics_to_save = utils.check_dict_types(metrics_to_save)
    return metrics_to_save