예제 #1
0
def test_multiple_sources_transform(effective_version,
                                    generate_main_summary_data,
                                    generate_new_profile_data):
    """Transform over the union of main-summary and new-profile sources.

    Three distinct clients appear across the two sources (client 2 is
    duplicated in new-profile); the aggregate profile count is 3.
    """
    main_summary = generate_main_summary_data(
        [{"client_id": "1"}, {"client_id": "3"}])
    new_profile = generate_new_profile_data(
        [{"client_id": "1"}, {"client_id": "2"}, {"client_id": "2"}])

    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 1, 0,
                          False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    # There are two different channels
    assert df.count() == 2

    totals = df.select(F.sum("n_profiles").alias("n_profiles")).first()
    assert totals.n_profiles == 3
예제 #2
0
def test_extract_main_summary(spark, generate_main_summary_data):
    """Extraction yields a single row when only main_summary has data."""
    empty_new_profile = spark.createDataFrame([], data.new_profile_schema)
    extracted = job.extract(
        generate_main_summary_data(None),
        empty_new_profile,
        WEEK_START_DS,
        1,
        0,
        False,
    )
    assert extracted.count() == 1
예제 #3
0
def test_extract_new_profile(spark, generate_new_profile_data):
    """A client seen only in new-profile pings still produces a row;
    main-summary-only columns come back null."""
    empty_main = spark.createDataFrame([], data.main_summary_schema)
    df = job.extract(empty_main, generate_new_profile_data([{}]),
                     WEEK_START_DS, 1, 0, False)
    assert df.count() == 1

    row = df.first()
    # Columns sourced from main_summary are absent for this client.
    assert row["subsession_length"] is None
    assert row["scalar_parent_browser_engagement_total_uri_count"] is None
    # profile_creation_date is carried over from the new-profile sample.
    expected_pcd = (
        data.new_profile_sample["environment"]["profile"]["creation_date"])
    assert row["profile_creation_date"] == expected_pcd
예제 #4
0
def test_ignored_submissions_outside_of_period(spark,
                                               generate_main_summary_data):
    """Pings submitted too late or started too early are excluded."""
    # All pings within 17 days of the submission start date are valid.
    # However, only pings with ssd within the 7 day retention period
    # are used for computation. Generate pings for this case.
    out_of_range = [
        data.generate_dates(SUBSESSION_START, submission_offset=18),
        data.generate_dates(SUBSESSION_START.replace(days=-7)),
    ]
    df = job.extract(generate_main_summary_data(out_of_range),
                     spark.createDataFrame([], data.new_profile_schema),
                     WEEK_START_DS, 7, 10, False)
    assert df.count() == 0
예제 #5
0
def main(start_date, path, input_bucket, input_prefix, period, slack, sample):
    """Compute retention aggregates and persist them to `path`.

    The effective start date is shifted back by `slack` days so that
    late-arriving submissions are still picked up.
    """
    spark = SparkSession.builder.appName('retention').getOrCreate()
    spark.conf.set('spark.sql.session.timeZone', 'UTC')

    # Shift the starting day back to account for submission latency.
    start_ds = utils.format_date(
        arrow.get(start_date, utils.DS_NODASH), utils.DS_NODASH, -slack)

    main_summary_path = utils.format_spark_path(input_bucket, input_prefix)
    main_summary = (
        spark.read
        .option('mergeSchema', 'true')
        .parquet(main_summary_path)
    )

    # New-profile pings live at a fixed location in the pipeline bucket.
    new_profile = spark.read.parquet(
        "s3://net-mozaws-prod-us-west-2-pipeline-data/"
        "telemetry-new-profile-parquet/v1/")

    extracted = churn_job.extract(
        main_summary, new_profile, start_ds, period, slack, sample)

    save(transform(extracted, start_ds), path)
예제 #6
0
def test_attribution_from_new_profile(
    effective_version, generate_main_summary_data, generate_new_profile_data
):
    """Attribution can be backfilled from new-profile pings; for each
    client the newest ping wins (see the per-client fixture comments)."""
    main_summary = generate_main_summary_data(
        [
            {"client_id": "1", "attribution": {"source": "mozilla.org"}},
            {"client_id": "3", "attribution": None},
            {"client_id": "4", "attribution": None},
            {
                "client_id": "5",
                "attribution": {"source": "mozilla.org"},
                # timestamp appears to be nanoseconds (seconds * 10**9) —
                # TODO confirm against the generator's schema
                "timestamp": SUBSESSION_START.shift(days=1).timestamp * 10 ** 9,
            },
            {
                "client_id": "6",
                "attribution": {"source": "mozilla.org"},
                "timestamp": SUBSESSION_START.shift(days=1).timestamp * 10 ** 9,
            },
            {"client_id": "7", "attribution": {"source": "mozilla.org"}},
        ]
    )

    def update_attribution(attribution):
        # only updates the attribution section in the environment
        env = copy.deepcopy(data.new_profile_sample["environment"])
        env["settings"]["attribution"] = attribution
        return env

    new_profile = generate_new_profile_data(
        [
            # new profile without a main summary companion
            {
                "client_id": "2",
                "environment": update_attribution({"source": "mozilla.org"}),
            },
            # recover null attribution
            {
                "client_id": "3",
                "environment": update_attribution({"source": "mozilla.org"}),
            },
            # new-profile ping used to recover attribution, but outside of
            # the current retention period
            {
                "client_id": "4",
                "environment": update_attribution({"source": "mozilla.org"}),
                "submission": SUBSESSION_START.shift(days=-7).format("YYYYMMDD"),
            },
            # avoid accidentally overwriting an existing value with an empty structure
            {"client_id": "5", "environment": update_attribution({})},
            # main-pings have higher latency than new-profile pings, so the main
            # ping attribution state will be set correctly
            # NOTE(review): key is "client", not "client_id" — looks like a
            # typo; verify whether the generator substitutes a default id.
            {"client": "6", "environment": update_attribution(None)},
            # new-profile timestamp is newer than main-ping, so attribution for the
            # client is unset
            {
                "client_id": "7",
                "environment": update_attribution(None),
                "timestamp": SUBSESSION_START.shift(days=1).timestamp * 10 ** 9,
            },
        ]
    )
    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 2, 0, False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    # Six profiles total end up attributed to mozilla.org.
    assert df.where("source='mozilla.org'").agg(F.sum("n_profiles")).first()[0] == 6
예제 #7
0
def test_attribution_from_new_profile(effective_version,
                                      generate_main_summary_data,
                                      generate_new_profile_data):
    """Attribution can be backfilled from new-profile pings; for each
    client the newest ping wins (see the per-client fixture comments)."""
    main_summary = generate_main_summary_data([
        {
            'client_id': '1',
            'attribution': {
                'source': 'mozilla.org'
            }
        },
        {
            'client_id': '3',
            'attribution': None
        },
        {
            'client_id': '4',
            'attribution': None
        },
        {
            'client_id': '5',
            'attribution': {
                'source': 'mozilla.org'
            },
            # timestamp appears to be nanoseconds (seconds * 10**9) —
            # TODO confirm against the generator's schema
            'timestamp': SUBSESSION_START.shift(days=1).timestamp * 10**9,
        },
        {
            'client_id': '6',
            'attribution': {
                'source': 'mozilla.org'
            },
            'timestamp': SUBSESSION_START.shift(days=1).timestamp * 10**9,
        },
        {
            'client_id': '7',
            'attribution': {
                'source': 'mozilla.org'
            }
        },
    ])

    def update_attribution(attribution):
        # only updates the attribution section in the environment
        env = copy.deepcopy(data.new_profile_sample['environment'])
        env['settings']['attribution'] = attribution
        return env

    new_profile = generate_new_profile_data([
        # new profile without a main summary companion
        {
            'client_id': '2',
            'environment': update_attribution({'source': 'mozilla.org'})
        },
        # recover null attribution
        {
            'client_id': '3',
            'environment': update_attribution({'source': 'mozilla.org'})
        },
        # new-profile ping used to recover attribution, but outside of
        # the current retention period
        {
            'client_id': '4',
            'environment': update_attribution({'source': 'mozilla.org'}),
            'submission': SUBSESSION_START.shift(days=-7).format("YYYYMMDD"),
        },
        # avoid accidentally overwriting an existing value with an empty structure
        {
            'client_id': '5',
            'environment': update_attribution({})
        },
        # main-pings have higher latency than new-profile pings, so the main
        # ping attribution state will be set correctly
        # NOTE(review): key is 'client', not 'client_id' — looks like a
        # typo; verify whether the generator substitutes a default id.
        {
            'client': '6',
            'environment': update_attribution(None)
        },
        # new-profile timestamp is newer than main-ping, so attribution for the
        # client is unset
        {
            'client_id': '7',
            'environment': update_attribution(None),
            'timestamp': SUBSESSION_START.shift(days=1).timestamp * 10**9,
        },
    ])
    sources = job.extract(main_summary, new_profile, WEEK_START_DS, 2, 0,
                          False)
    df = job.transform(sources, effective_version, WEEK_START_DS)

    # Six profiles total end up attributed to mozilla.org.
    assert df.where("source='mozilla.org'").agg(
        F.sum("n_profiles")).first()[0] == 6