Example #1
def get_avg_intensity(data, date, period=7, country_list=None):
    """ Calculate Average Intensity of the last 7 days for a particular date

        Parameters:
        data: sample of the main server ping data frame
        date: string, with the format of 'yyyyMMdd'
        period: the number of days before the given date to include in the analysis (default 7)
        country_list: a list of country names in string

        Returns:
        a dataframe with three columns: 'submission_date_s3', 'country', 'avg_intensity'
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    data_agg = data_all\
        .filter("submission_date_s3 <= '{0}' and submission_date_s3 > '{1}'".format(date, begin))\
        .filter("subsession_length <= 86400")\
        .filter("subsession_length > 0")\
        .filter('active_ticks <= 17280')\
        .groupBy('country', 'client_id', 'submission_date_s3')\
        .agg(F.sum('subsession_length').alias('total_daily_time'),
             F.sum('active_ticks').alias('total_daily_ticks'))\
        .select('country',
                'client_id',
                'submission_date_s3',
                F.when(F.col('total_daily_time') > 86400, 86400)
                 .otherwise(F.col('total_daily_time'))
                 .alias('total_daily_time'),
                F.when(F.col('total_daily_ticks') > 17280, 17280)
                 .otherwise(F.col('total_daily_ticks'))
                 .alias('total_daily_ticks'))\
        .select('*',
                (col('total_daily_ticks') * 5 / col('total_daily_time'))
                .alias('daily_intensity'))\
        .select('country',
                'client_id',
                'submission_date_s3',
                F.when(F.col('daily_intensity') > 1, 1)
                 .otherwise(F.col('daily_intensity'))
                 .alias('daily_intensity'))

    country_avg_intensity = data_agg\
        .groupBy('country', 'client_id')\
        .agg(F.avg('daily_intensity').alias('avg_7d_intensity'))\
        .groupBy('country')\
        .agg(F.avg('avg_7d_intensity').alias('avg_intensity'))\
        .select(lit(date).alias('submission_date_s3'), '*')

    df = country_avg_intensity.orderBy('submission_date_s3', 'country')

    return df
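All of the examples on this page lean on two small helpers that are not shown here, keep_countries_and_all and date_plus_x_days. The sketch below is an assumption about what they do, inferred from the equivalent logic inlined in getPAU (Example #3): relabel every row with a synthetic 'All' country (optionally unioning in the rows for the requested countries) and shift a 'yyyyMMdd' date string by a number of days.

from datetime import datetime, timedelta

from pyspark.sql.functions import col, lit


def keep_countries_and_all(data, country_list):
    """Assumed behaviour: relabel every row as country='All' and, if a
    country_list is given, union in the rows whose country is in that list."""
    data_all = data.drop('country').select('*', lit('All').alias('country'))
    if country_list is None:
        return data_all
    data_country = data.filter(col('country').isin(country_list))\
                       .select(data_all.columns)
    return data_all.union(data_country)


def date_plus_x_days(date, x):
    """Assumed behaviour: shift a 'yyyyMMdd' date string by x days (x may be negative)."""
    new_date = datetime.strptime(date, '%Y%m%d') + timedelta(days=x)
    return new_date.strftime('%Y%m%d')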
Example #2
def top_10_addons_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the number of users in the past week who have used the top N addons,
        broken down by country.

        Parameters:
        data - sample of the main server ping data frame.
        date - The day for which you want to get the top N addons.
        topN - the number of addons to get.
        period - number of days to use to calculate metric
        country_list - a list of country names in string

        Returns:
        Dataframe containing the percentage of weekly active users using each of the top addons:
        submission_date_s3, country, addon_id, addon_name, pct_with_addon
    """
    addon_filter = (~col('addon.is_system')) & (~col('addon.foreign_install')) & \
        (~col('addon.addon_id').isin(NON_MOZ_TP)) & (~col('addon.addon_id').like('%@mozilla%')) &\
        (~col('addon.addon_id').like('%@shield.mozilla%')) &\
        (~col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all.filter((col('submission_date_s3') > begin) &
                          (col('submission_date_s3') <= date))\
        .groupBy('country')\
        .agg(lit(date).alias('submission_date_s3'),
             F.countDistinct('client_id').alias('wau'))

    counts = data_all.select('submission_date_s3', 'country', 'client_id',
                             F.explode('active_addons').alias('addon'))\
        .filter((col('submission_date_s3') > begin) &
                (col('submission_date_s3') <= date))\
        .filter(addon_filter)\
        .select('country', 'client_id', 'addon.addon_id', 'addon.name')\
        .distinct()\
        .groupBy('country', 'addon_id')\
        .agg(F.count('*').alias('number_of_users'), F.last('name').alias('name'))\
        .select('*', lit(date).alias('submission_date_s3'),
                lit(begin).alias('start_date'),
                F.row_number().over(Window.partitionBy('country')
                                    .orderBy(desc('number_of_users'))
                                    .rowsBetween(Window.unboundedPreceding, Window.currentRow))
                              .alias('rank'))\
        .filter(col('rank') <= topN)

    return counts.join(F.broadcast(wau), on=['country'], how='left')\
        .select(lit(date).alias('submission_date_s3'), 'country',
                'addon_id', col('name').alias('addon_name'),
                (100.0 * col('number_of_users') / col('wau')).alias('pct_with_addon'))
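A hypothetical call, just to show the expected argument types; main_summary stands in for a sample of the main server ping data frame, and NON_MOZ_TP / UNIFIED_SEARCH_STR are assumed to be defined as module-level constants.

top_addons = top_10_addons_on_date(
    main_summary,               # sample of the main server ping data frame
    date='20190331',            # 'yyyyMMdd' string
    topN=10,                    # keep the ten most used addons per country
    period=7,                   # look back one week
    country_list=['DE', 'US'],
)
top_addons.show(truncate=False)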
Example #3
def getPAU(data, date, period, sample_factor=1, country_list=None):
    """ Calculates the PAU for a given period for each time in epoch_times.

        This function is fast for finding the PAU for a small number of dates.

        Parameters:

        data - This should be a sample of the main server ping data frame.
        date - The day to calculate PAU for, given as a 'yyyyMMdd' string.
        period - The number of days that we count distinct number of users.
                 For example MAU has a period = 28 and YAU has a period = 365.
        sample_factor - the factor to multiply counts by, pre-calculated based
                        on sample
        country_list - A list of countries that we want to calculate the
                       PAU for.

        Output:

        A data frame, this data frame has 3 columns
            submission_date_s3, country, PAU(WAU/MAU/YAU).
    """
    def process_data(data, begin, date):
        return data\
            .filter(col('submission_date_s3') > begin)\
            .filter(col('submission_date_s3') <= date)\
            .groupBy('country')\
            .agg((sample_factor * countDistinct('client_id')).alias(active_users_col))\
            .select('*',
                    lit(begin).alias(start_date_col),
                    lit(date).alias('submission_date_s3'))

    data_all = data.drop('country').select('*', lit('All').alias('country'))
    if country_list is not None:
        data_country = data.filter(col('country').isin(country_list))
    # define column names based on period
    active_users_col = PERIOD_DESC.get(period, "other")
    start_date_col = 'start_date_' + PERIOD_DESC.get(period, "other")

    begin = date_plus_x_days(date, -period)

    current_count = process_data(data_all, begin, date)
    if country_list is not None:
        df_country = process_data(data_country, begin, date)
        current_count = current_count.union(df_country)

    return current_count.select('submission_date_s3', 'country',
                                active_users_col)
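PERIOD_DESC is not shown on this page. Judging by how getPAU names its output column and by the docstring (period = 28 for MAU, 365 for YAU), it is presumably a mapping along the lines sketched below, and the getWAU used in Example #5 is presumably a thin wrapper that fixes period=7; both are assumptions.

PERIOD_DESC = {
    7: 'WAU',     # weekly active users
    28: 'MAU',    # monthly active users
    365: 'YAU',   # yearly active users
}


def getWAU(data, date, country_list=None):
    # assumed convenience wrapper around getPAU for a 7 day window
    return getPAU(data, date, 7, country_list=country_list)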
Example #4
def get_daily_avg_session(data, date, period=7, country_list=None):
    """ Calculate Average Daily usage of the last 7 days for a particular date

        Parameters:
        data: sample of the main server ping data frame
        date: string, with the format of 'yyyyMMdd'
        period: the number of days before the given date to include in the analysis (default 7)
        country_list: a list of country names in string

        Returns:
        a dataframe with three columns:
            'submission_date_s3',
            'country',
            'avg_daily_usage(hours)'
    """

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    data_agg = data_all\
        .filter("submission_date_s3 <= '{}' and submission_date_s3 > '{}'"
                .format(date, begin))\
        .filter("subsession_length <= 86400") .filter("subsession_length > 0")\
        .groupBy('country',
                 'client_id',
                 'submission_date_s3')\
        .agg(F.sum('subsession_length').alias('total_daily_time'))\
        .select('country',
                'client_id',
                'submission_date_s3',
                F.when(F.col('total_daily_time') > 86400, 86400)
                 .otherwise(F.col('total_daily_time'))
                 .alias('total_daily_time'))

    country_avg_session = data_agg\
        .groupBy('country', 'client_id')\
        .agg(F.avg('total_daily_time').alias('client_7d_avg'))\
        .groupBy('country')\
        .agg(F.avg('client_7d_avg').alias('avg_daily_subsession_length'))\
        .select(lit(date).alias('submission_date_s3'), '*')

    df = country_avg_session.orderBy('submission_date_s3', 'country')

    df = df.withColumn('avg_daily_usage(hours)',
                       df.avg_daily_subsession_length / 3600)

    return df.select('submission_date_s3', 'country', 'avg_daily_usage(hours)')
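A small local smoke test, assuming a SparkSession can be created, that the usual imports (from pyspark.sql import functions as F; from pyspark.sql.functions import col, lit) are in place, and that the helpers sketched after Example #1 are in scope; the toy rows only carry the columns this function actually touches.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()

rows = [
    ('20190330', 'DE', 'client-a', 3600),   # one hour
    ('20190331', 'DE', 'client-a', 7200),   # two hours
    ('20190331', 'US', 'client-b', 1800),   # half an hour
]
toy = spark.createDataFrame(
    rows, ['submission_date_s3', 'country', 'client_id', 'subsession_length'])

get_daily_avg_session(toy, '20190331', period=7, country_list=['DE']).show()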
Example #5
def new_users(data, date, period=7, country_list=None):
    """Gets the percentage of WAU that are new users.

        Parameters:

        data - This should be the entire main server ping data frame.
        date - the day to calculate the metric for
        period - the number of days before the given date included in the analysis (default 7)
        country_list - A list of countries that we want to calculate the
                       metric for.

        Returns:
          A dataframe with columns
            submission_date_s3, country, pct_new_user
    """

    cols = ['submission_date_s3', 'client_id', 'profile_creation_date',
            'country']

    wau = getWAU(data, date, country_list)
    df = data.drop('country').select('*', lit('All').alias('country'))

    if country_list is not None:
        df = (
            df.select(cols).union(data.select(cols)
                                  .filter(col('country').isin(country_list))))
    begin = date_plus_x_days(date, -period)
    new_profiles = (df.filter(df.submission_date_s3 <= date)
                      .filter(df.submission_date_s3 > begin)
                      .withColumn('pcd_str',
                                  from_unixtime(col('profile_creation_date') * 24 * 60 * 60,
                                                format='yyyyMMdd'))
                      .filter(col('pcd_str') <= date)
                      .filter(col('pcd_str') > begin))

    new_user_counts = (
          new_profiles
          .groupBy('country')
          .agg((countDistinct('client_id')).alias('new_users')))

    return wau.join(new_user_counts, on=['country'], how='left')\
              .select('submission_date_s3',
                      'country',
                      (100.0 * col('new_users') / col('WAU')).alias('pct_new_user'))
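profile_creation_date in this data is stored as days since the Unix epoch, which is why new_users multiplies it by 24 * 60 * 60 before handing it to from_unixtime (which expects seconds). A quick plain-Python check of that conversion, with an example value:

from datetime import datetime, timezone

days_since_epoch = 17974                  # example value only
seconds = days_since_epoch * 24 * 60 * 60
print(datetime.fromtimestamp(seconds, tz=timezone.utc).strftime('%Y%m%d'))
# prints 20190319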
Example #6
def pct_new_version(data,
                    date,
                    period=7,
                    country_list=None,
                    url=RELEASE_VERSIONS_URL):
    """ Calculate the proportion of active users on the latest release version every day.
        Parameters:
        data: sample of the main server ping data frame
        date: The day to calculate the metric
        period: number of days to use to calculate metric
        country_list: a list of country names in string
        url: path to the json file containing all the firefox release information to date
        Returns:
        a dataframe with three columns - 'submission_date_s3', 'country',
                                         'pct_latest_version'
    """

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    latest_version = get_latest_version(date, url)
    data_filtered = data_all.filter("""
                                    {0} >= '{1}' and {0} <= '{2}'
                                    """.format("submission_date_s3", begin, date))\
                            .withColumn('app_major_version',
                                        split('app_version',
                                              r'\.').getItem(0))\
                            .select('submission_date_s3',
                                    'client_id',
                                    'app_major_version',
                                    'country')

    WAU = data_filtered.groupBy('country')\
                       .agg(F.countDistinct('client_id').alias('WAU'))
    WAU_latest = data_filtered.filter(F.col('app_major_version') >= F.lit(latest_version))\
                              .groupBy('country')\
                              .agg(F.countDistinct('client_id').alias('WAU_is_latest'))
    join_df = WAU.join(WAU_latest, 'country', 'left')\
                 .withColumn("pct_latest_version", (100.0 * F.col("WAU_is_latest") / F.col("WAU")))\
                 .select(F.lit(date).alias('submission_date_s3'),
                         'country',
                         F.coalesce('pct_latest_version', F.lit(0)).alias('pct_latest_version'))
    return join_df
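get_latest_version and RELEASE_VERSIONS_URL are not shown here. Based on the docstring ("path to the json file containing all the firefox release information to date") and on the comparison against app_major_version, the helper presumably returns the newest major version released on or before the given date. The sketch below, including the assumed JSON layout of version -> release date, is only an illustration.

import json
import urllib.request
from datetime import datetime


def get_latest_version(date, url):
    # assumed layout: {"66.0": "2019-03-19", "65.0": "2019-01-29", ...}
    with urllib.request.urlopen(url) as response:
        releases = json.loads(response.read())
    cutoff = datetime.strptime(date, '%Y%m%d')
    released = [
        int(version.split('.')[0])
        for version, release_date in releases.items()
        if datetime.strptime(release_date, '%Y-%m-%d') <= cutoff
    ]
    return max(released)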
Example #7
def os_on_date(data, date, period=7, country_list=None):
    """ Gets the distribution of OS usage calculated on the WAU on 1 day.

        Parameters:
        data: Usually the main summary data frame.
        date: day to get the os distribution for the past week.
        period: The number of days to calculate the distribution over. By default it finds the os
                distribution over a week.
        country_list: The countries to do the analysis. If None then it does it for the whole
                      world.

        Returns:
            submission_date_s3, country, os, pct_on_os
       """

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)
    data_all = data_all.select(
        'client_id', 'submission_date_s3', 'country',
        nice_os(col('os'), col('os_version')).alias('nice_os'))

    # Calculate the WAU
    wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country')\
        .agg(countDistinct('client_id').alias('WAU'))

    os_wau = data_all\
        .filter((col('submission_date_s3') <= date) &
                (col('submission_date_s3') > begin))\
        .groupBy('country', 'nice_os')\
        .agg(countDistinct('client_id').alias('WAU_on_OS'))\
        .select(lit(begin).alias('start_date'), lit(date).alias('submission_date_s3'),
                'country', 'WAU_on_OS', 'nice_os')

    res = os_wau.join(wau, 'country', how='left')\
                .select('start_date', 'submission_date_s3',
                        'country', 'WAU_on_OS', 'nice_os', 'WAU')

    return res.select('submission_date_s3', 'country',
                      col('nice_os').alias('os'),
                      (100.0 * col('WAU_on_OS') /
                       col('WAU')).alias('pct_on_os'))
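nice_os is another helper that is not defined on this page. Given that it is called with the raw os and os_version columns, it presumably returns a column expression that collapses them into readable labels; the buckets below are illustrative assumptions, not the project's actual mapping.

from pyspark.sql import functions as F


def nice_os(os_col, os_version_col):
    # assumed bucketing of raw os / os_version values into display names
    return (F.when(os_col == 'Darwin', F.lit('Mac OS X'))
             .when(os_col.startswith('Windows'),
                   F.concat(F.lit('Windows '), os_version_col.cast('string')))
             .when(os_col == 'Linux', F.lit('Linux'))
             .otherwise(F.lit('Other')))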
Example #8
def get_addon(data, date, period=7, country_list=None):
    """ Calculate the proportion of WAU that have a "self installed" addon for a specific date

        Parameters:
            data: sample of the main server ping data frame
            date: string, with the format of 'yyyyMMdd'
            period: The number of days before looked at in the analysis
            country_list: a list of country names in string

        Returns:
            a dataframe with three columns for the given date:
              'submission_date_s3', 'country', 'pct_addon'
    """

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    addon_filter = (~F.col('addon.is_system')) & (~F.col('addon.foreign_install')) &\
                   (~F.col('addon.addon_id').isin(NON_MOZ_TP)) &\
                   (~F.col('addon.addon_id').like('%@mozilla%')) &\
                   (~F.col('addon.addon_id').like('%@shield.mozilla%')) &\
                   (~F.col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    WAU = data_all\
        .filter("submission_date_s3 <= '{0}' and submission_date_s3 > '{1}'".format(date, begin))\
        .groupBy('country')\
        .agg(F.countDistinct('client_id').alias('WAU'))

    addon_count = data_all\
        .filter("submission_date_s3 <= '{0}' and submission_date_s3 > '{1}'".format(date, begin))\
        .select('submission_date_s3', 'country', 'client_id',
                F.explode('active_addons').alias('addon'))\
        .filter(addon_filter)\
        .groupBy('country')\
        .agg(F.countDistinct('client_id').alias('add_on_count'))

    join_df = WAU.join(addon_count, 'country', how='left')\
        .withColumn("pct_addon", (100.0 * F.col("add_on_count") / F.col("WAU")))\
        .select(F.lit(date).alias('submission_date_s3'), '*')

    return join_df.select('submission_date_s3', 'country', 'pct_addon')
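One behavioural detail worth noting: unlike pct_new_version, get_addon does not coalesce after its left join, so a country whose clients report no self-installed addons comes back with a null pct_addon rather than 0. If zeros are preferred, the same coalesce pattern can be applied afterwards (main_summary here is a placeholder for the real input frame):

from pyspark.sql import functions as F

result = get_addon(main_summary, '20190331', country_list=['DE', 'US'])
result = result.withColumn('pct_addon',
                           F.coalesce(F.col('pct_addon'), F.lit(0.0)))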
Example #9
def locale_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the ratio of the top locales in each country over the last week.

    Parameters:
        data: sample of the main server ping data frame
        date: The date to find the locale distribution for
        topN: The number of locales to get for each country. Only does the top N.
        period: The number of days before the given date included in the analysis
        country_list: The list of countries to look at in the analysis

    output:
       dataframe with columns:
           ['country', 'submission_date_s3', 'locale', 'pct_on_locale']
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country')\
        .agg(countDistinct('client_id').alias('WAU'))

    locale_wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country', 'locale')\
        .agg(countDistinct('client_id').alias('WAU_on_locale'))\
        .select(lit(begin).alias('start_date'), lit(date).alias('submission_date_s3'),
                'country', 'WAU_on_locale', 'locale')

    res = locale_wau.join(wau, 'country', how='left')\
        .select('start_date', 'submission_date_s3',
                'country', 'WAU_on_locale', 'locale', 'WAU')

    rank_window = Window.partitionBy('country', 'submission_date_s3').orderBy(
        desc('WAU_on_locale'))

    return res.select('*', F.row_number().over(rank_window).alias('rank'))\
        .filter(col('rank') <= topN)\
        .select('submission_date_s3', 'country', 'locale',
                (100.0 * col('WAU_on_locale') / col('WAU')).alias('pct_on_locale'))
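Finally, a hypothetical driver that runs several of these metrics for one reporting date; main_summary and the module-level constants the functions reference are assumed to already be available in the calling environment.

date = '20190331'
countries = ['DE', 'FR', 'US']

metrics = {
    'avg_intensity': get_avg_intensity(main_summary, date, country_list=countries),
    'avg_daily_usage': get_daily_avg_session(main_summary, date, country_list=countries),
    'pct_new_users': new_users(main_summary, date, country_list=countries),
    'top_addons': top_10_addons_on_date(main_summary, date, 10, country_list=countries),
    'top_locales': locale_on_date(main_summary, date, 5, country_list=countries),
}

for name, frame in metrics.items():
    print(name)
    frame.show(5, truncate=False)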