def test_all_(spark):
    """Check that ``all_`` computes an element-wise AND over boolean Columns."""
    frame = spark.createDataFrame(
        data=[
            [True, False, False],
            [True, False, True],
            [True, False, False],
            [True, False, True],
        ],
        schema=["all_true", "all_false", "mixed"],
    )
    result = frame.select(
        all_([frame.all_true, frame.all_true, frame.all_true]).alias("true_1"),
        all_([frame.all_true, frame.all_false]).alias("false_2"),
        all_([frame.all_false, frame.all_false]).alias("false_3"),
        all_([frame.mixed, frame.all_false]).alias("false_4"),
        all_([frame.mixed, frame.all_true]).alias("mixed_5"),
    ).toPandas()

    assert result.shape == (4, 5)
    assert not result.isnull().any().any()

    # Any all_true column ANDed with itself stays True.
    assert result["true_1"].all()
    # Anything ANDed with all_false is False.
    for column in ("false_2", "false_3", "false_4"):
        assert not result[column].any()
    # mixed alternates False/True, so the even rows are False and the odd True.
    assert not result["mixed_5"][::2].any()
    assert result["mixed_5"][1::2].all()

    # Check this workaround is still necessary: the builtin all() cannot
    # handle Columns, so it should raise rather than silently mis-evaluate.
    with pytest.raises(ValueError):
        all([frame.all_true, frame.all_true])
def make_select_col(event, metric_key):
    """Return an int Column named `metric_key`, given an event Column.

    In the returned Column, a row is 1 if the event is `metric_key`,
    otherwise it is 0. Useful when trying to count the number of
    occurrences of an event - just sum this column.

    Example usage:

        from pyspark.sql import functions as F
        t = spark.table('telemetry_mobile_event_parquet')
        t2 = t.filter(
            t.submission_date_s3 == '20190101'
        ).select(
            F.explode(t.events).alias('event')
        )
        t3 = t2.select(
            make_select_col(t2.event, 'session_start'),
            make_select_col(t2.event, 'session_end')
        )
        t3.agg(F.sum(t3.session_start), F.sum(t3.session_end)).collect()
    """
    # One equality condition per field the metric definition specifies.
    conditions = [
        event[field] == expected
        for (field, expected) in metric_library[metric_key].items()
    ]
    return all_(conditions).astype('int').alias(metric_key)
def pocket_video_clicks(fe):
    """Return a Column summing clicks on the 'pocket_video_tile' menu item.

    `fe` is presumably a DataFrame of exploded events — confirm against
    callers.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; siblings like `unenroll` return a bare Column, so the
    comma is removed.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'click',
        fe.event.object == 'menu',
        fe.event.value == 'pocket_video_tile'
    ]).astype('int'))
def unenroll(events, experiment):
    """Return a Column that is True if any row is a normandy unenroll
    event for this experiment's slug."""
    is_unenroll_event = all_([
        events.event_category == 'normandy',
        events.event_method == 'unenroll',
        events.event_string_value == experiment.experiment_slug,
    ])
    return agg_any(is_unenroll_event)
def tracking_protection_toggle_off(fe):
    """Return a Column summing events that turn 'turbo_mode' off.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned, consistent
    with `unenroll` and the other aggregations.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'change',
        fe.event.object == 'turbo_mode',
        fe.event.value == 'off'
    ]).astype('int'))
def remote_backs(fe):
    """Return a Column summing 'back' page events on the browser object.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'page',
        fe.event.object == 'browser',
        fe.event.value == 'back'
    ]).astype('int'))
def browser_backs(fe):
    """Return a Column summing 'back' clicks in the menu.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'click',
        fe.event.object == 'menu',
        fe.event.value == 'back'
    ]).astype('int'))
def bundled_non_youtube_tile_clicks(fe):
    """Return a Column summing clicks on bundled home tiles, excluding the
    youtube tile.

    NOTE(review): `!= 'youtube'` is null when `tile_id` is absent, so rows
    with no `tile_id` are not counted here — `anything_but_youtube_tile_clicks`
    handles nulls explicitly; confirm this difference is intended.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'click',
        fe.event.object == 'home_tile',
        fe.event.value == 'bundled',
        fe.event.extra['tile_id'] != 'youtube',
    ]).astype('int'))
def home_tile_clicks(fe):
    """Return a Column summing home tile clicks, excluding the youtube tile.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'click',
        fe.event.object == 'home_tile',
        # Otherwise youtube is double counted (per liuche 2019/06/07) :'(
        fe.event.value != 'youtube_tile'
    ]).astype('int'))
def anything_but_youtube_tile_clicks(fe):
    """Return a Column summing home tile clicks on anything but youtube.

    The isnull check keeps rows whose `tile_id` is absent: a bare
    `!= 'youtube'` comparison would be null (and so excluded) for those rows.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'click',
        fe.event.object == 'home_tile',
        fe.event.value != 'youtube_tile',
        (
            F.isnull(fe.event.extra['tile_id'])
            | (fe.event.extra['tile_id'] != 'youtube')
        )
    ]).astype('int'))
def navigates_or_clicks_not_youtube(fe):
    """Return a Column summing events that are either a non-youtube home tile
    click, a typed URL, or a typed search query.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(any_([
        # Non-youtube home tile click; the isnull check keeps rows whose
        # tile_id is absent (a bare != comparison would be null there).
        all_([
            fe.event.category == 'action',
            fe.event.method == 'click',
            fe.event.object == 'home_tile',
            fe.event.value != 'youtube_tile',
            (
                F.isnull(fe.event.extra['tile_id'])
                | (fe.event.extra['tile_id'] != 'youtube')
            )
        ]),
        # URL typed into the search bar.
        all_([
            fe.event.category == 'action',
            fe.event.method == 'type_url',
            fe.event.object == 'search_bar',
        ]),
        # Search query typed into the search bar.
        all_([
            fe.event.category == 'action',
            fe.event.method == 'type_query',
            fe.event.object == 'search_bar',
        ])
    ]).astype('int'))
def make_where(event, metric_key):
    """Return a bool Column named `metric_key`, given an event Column.

    In the returned Column, a row is True iff the event is `metric_key`.
    Useful when filtering for an event.

    Example usage:

        t = spark.table('telemetry_mobile_event_parquet')
        t2 = t.filter(
            t.submission_date_s3 == '20190101'
        ).select(
            F.explode(t.events).alias('event')
        )
        t3 = t2.filter(make_where(t2.event, 'session_start'))
    """
    # One equality condition per field the metric definition specifies.
    conditions = [
        event[field] == expected
        for (field, expected) in metric_library[metric_key].items()
    ]
    return all_(conditions).alias(metric_key)
def view_about_protections(events):
    """Return a Column that is True if any row shows the protection report."""
    report_shown = all_([
        events.event_object == 'protection_report',
        events.event_method == 'show',
    ])
    return agg_any(report_shown)
def view_about_logins(events):
    """Return a Column that is True if any row opens password management."""
    management_opened = all_([
        events.event_method == 'open_management',
        events.event_category == 'pwmgr',
    ])
    return agg_any(management_opened)
def type_queries(fe):
    """Return a Column summing search queries typed into the search bar.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'type_query',
        fe.event.object == 'search_bar',
    ]).astype('int'))
def user_show_menus(fe):
    """Return a Column summing user-initiated menu displays.

    Fix: the original `return` ended with a trailing comma, which wrapped the
    Column in a 1-tuple; removed so a bare Column is returned.
    """
    return F.sum(all_([
        fe.event.category == 'action',
        fe.event.method == 'user_show',
        fe.event.object == 'menu',
    ]).astype('int'))