from datetime import date
from unittest.mock import patch

# Alias inferred from the call sites below; the helpers under test live in
# moztelemetry.standards (the module patched in the test).
from moztelemetry import standards as moz_utils


def test_last_week_range():
    def test_week(week, startday_num, endday_num):
        # Did we get the expected weekday (e.g. Sunday) as the beginning of
        # the week...
        assert week[0].weekday() == startday_num
        # ... and the expected weekday (e.g. Saturday) as the end of the week?
        assert week[1].weekday() == endday_num

        # Is this a full week spanning exactly 7 days?
        delta = week[1] - week[0]
        assert delta.days == 6

        # Check that it's the closest full week. We monkey patched the 3rd
        # July 2015 as "today". The previous full week, starting on Sunday,
        # begins on the 21st June 2015. If the first weekday is Monday, then
        # on the 22nd June 2015.
        expected_dates = ((date(2015, 6, 21), date(2015, 6, 27))
                          if startday_num == 6
                          else (date(2015, 6, 22), date(2015, 6, 28)))
        assert week[0] == expected_dates[0]
        assert week[1] == expected_dates[1]

    with patch('moztelemetry.standards.date') as mock_date:
        # Mock date.today() to return a specific day, so we can properly test.
        mock_date.today.return_value = date(2015, 7, 3)
        mock_date.side_effect = lambda *args, **kw: date(*args, **kw)

        # Get the start and end date for the previous full week, as a tuple,
        # and make sure it's valid.
        prev_week = moz_utils.get_last_week_range("Sunday")
        test_week(prev_week, 6, 5)

        # As before, with a week starting with Monday.
        prev_week = moz_utils.get_last_week_range("Monday")
        test_week(prev_week, 0, 6)
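# For reference, a minimal sketch of the two helpers this test exercises.
# It mirrors the behaviour asserted above (verified against the mocked
# "today" of 3rd July 2015) and is an illustrative assumption, not the
# actual moztelemetry.standards implementation.
from datetime import date, timedelta


def snap_to_beginning_of_week(day, weekday_start="Sunday"):
    # date.weekday() counts Monday as 0 and Sunday as 6, so Sunday-based
    # weeks shift the index by one before snapping back.
    delta_days = ((day.weekday() + 1) % 7 if weekday_start == "Sunday"
                  else day.weekday())
    return day - timedelta(days=delta_days)


def get_last_week_range(weekday_start="Sunday"):
    # Snap today to the start of the current week, step back one full week,
    # and return that week's first and last day as a (start, end) tuple.
    start = snap_to_beginning_of_week(date.today(), weekday_start) - timedelta(weeks=1)
    return (start, start + timedelta(days=6))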
def generate_report(start_date, end_date, spark, spark_provider="emr"):
    """Generate the hardware survey dataset for the reference timeframe.

    If the timeframe is longer than a week, split it in weekly chunks and
    process each chunk individually (eases backfilling). The report for each
    week is saved in a local JSON file.

    Args:
        start_date: The date from which we start generating the report. If
            None, the report starts from the beginning of the past week
            (Sunday).
        end_date: The date that marks the end of the reporting period. This
            only makes sense if a |start_date| was provided. If None, this
            defaults to the end of the past week (Saturday).
        spark: SparkSession.
        spark_provider: Environment the application is running in. For `emr`,
            Longitudinal will be used; on `dataproc`, data will be loaded
            from `telemetry.main`.
    """
    # If no start_date was provided, generate a report for the past complete
    # week.
    last_week = moz_std.get_last_week_range()
    date_range = (
        moz_std.snap_to_beginning_of_week(start_date, "Sunday")
        if start_date is not None else last_week[0],
        end_date
        if (end_date is not None and start_date is not None) else last_week[1],
    )

    # Split the submission period in chunks, so we don't run out of resources
    # while aggregating if we want to backfill.
    chunk_start = date_range[0]
    chunk_end = None

    # Stores each week's hardware report, keyed by file name (which embeds
    # the week's start and end dates).
    date_to_json = {}
    while chunk_start < date_range[1]:
        chunk_end = chunk_start + dt.timedelta(days=6)

        (filtered_data, broken_ratio, inactive_ratio) = (
            get_data_longitudinal(spark, chunk_start, chunk_end)
            if spark_provider == "emr"
            else get_data_bigquery(spark, chunk_start, chunk_end))

        # Process the data, transforming it in the form we desire.
        device_map = build_device_map()
        processed_data = filtered_data.map(
            lambda d: prepare_data(d, device_map))

        logger.info("Aggregating entries...")
        aggregated_pings = aggregate_data(processed_data)

        # Get the sample count, we need it to compute the percentages instead
        # of raw numbers. Since we're getting only the newest ping for each
        # client, we can simply count the number of pings.
        # THIS MAY NOT BE CONSTANT ACROSS WEEKS!
        valid_records_count = filtered_data.count()

        # Collapse together groups that count less than 1% of our samples.
        threshold_to_collapse = int(valid_records_count * 0.01)

        logger.info("Collapsing smaller groups into the other bucket "
                    "(threshold {th})".format(th=threshold_to_collapse))
        collapsed_aggregates = collapse_buckets(aggregated_pings,
                                                threshold_to_collapse)

        logger.info("Post-processing raw values...")
        processed_aggregates = finalize_data(
            collapsed_aggregates,
            valid_records_count,
            broken_ratio,
            inactive_ratio,
            chunk_start,
        )

        if not validate_finalized_data(processed_aggregates):
            raise Exception("The aggregates failed to validate.")

        # Write the week start/end in the filename.
        suffix = ("-" + chunk_start.strftime("%Y%d%m") + "-" +
                  chunk_end.strftime("%Y%d%m"))
        file_name = get_file_name(suffix)

        date_to_json[file_name] = processed_aggregates

        # Move on to the next chunk: just add one day to the end of the last
        # chunk.
        chunk_start = chunk_end + dt.timedelta(days=1)

    return date_to_json
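# A hypothetical driver showing how generate_report might be invoked to
# backfill a few weeks. The SparkSession setup, the week span, and the JSON
# writing below are illustrative assumptions, not part of the module.
import datetime as dt
import json

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("hardware-report").getOrCreate()

    # Backfill starting five weeks ago; generate_report snaps the start date
    # to a Sunday and splits the range into weekly chunks on its own. With
    # end_date=None, the range ends at the close of the past complete week.
    start = dt.date.today() - dt.timedelta(weeks=5)
    reports = generate_report(start, None, spark, spark_provider="dataproc")

    # One JSON file per processed week, named by generate_report.
    for file_name, aggregates in reports.items():
        with open(file_name, "w") as out:
            json.dump(aggregates, out)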
def generate_report(start_date, end_date, spark):
    """Generate the hardware survey dataset for the reference timeframe.

    If the timeframe is longer than a week, split it in weekly chunks and
    process each chunk individually (eases backfilling). The report for each
    week is saved in a local JSON file.

    Args:
        start_date: The date from which we start generating the report. If
            None, the report starts from the beginning of the past week
            (Sunday).
        end_date: The date that marks the end of the reporting period. This
            only makes sense if a |start_date| was provided. If None, this
            defaults to the end of the past week (Saturday).
    """
    # If no start_date was provided, generate a report for the past complete
    # week.
    last_week = moz_std.get_last_week_range()
    date_range = (
        moz_std.snap_to_beginning_of_week(start_date, "Sunday")
        if start_date is not None else last_week[0],
        end_date
        if (end_date is not None and start_date is not None) else last_week[1],
    )

    # Split the submission period in chunks, so we don't run out of resources
    # while aggregating if we want to backfill.
    chunk_start = date_range[0]
    chunk_end = None

    # Stores each week's hardware report, keyed by file name (which embeds
    # the week's start and end dates).
    date_to_json = {}
    while chunk_start < date_range[1]:
        chunk_end = chunk_start + dt.timedelta(days=6)
        longitudinal_version = get_longitudinal_version(chunk_end)
        sql_query = """
            SELECT
                build,
                client_id,
                active_plugins,
                system_os,
                submission_date,
                system,
                system_gfx,
                system_cpu,
                normalized_channel
            FROM {}
            WHERE normalized_channel = 'release'
              AND build IS NOT NULL
              AND build[0].application_name = 'Firefox'
            """.format(longitudinal_version)
        frame = spark.sql(sql_query)

        # The number of all the fetched records (including inactive and
        # broken).
        records_count = frame.count()
        logger.info("Total record count for {}: {}".format(
            chunk_start.strftime("%Y%m%d"), records_count))

        # Fetch the data we need.
        data = frame.rdd.map(
            lambda r: get_latest_valid_per_client(r, chunk_start, chunk_end))

        # Filter out broken data.
        filtered_data = data.filter(
            lambda r: r not in [REASON_BROKEN_DATA, REASON_INACTIVE])

        # Count the broken records and inactive records.
        discarded = data.filter(
            lambda r: r in [REASON_BROKEN_DATA, REASON_INACTIVE]).countByValue()
        broken_count = discarded[REASON_BROKEN_DATA]
        inactive_count = discarded[REASON_INACTIVE]
        broken_ratio = broken_count / float(records_count)
        inactive_ratio = inactive_count / float(records_count)
        logger.info("Broken pings ratio: {}; Inactive clients ratio: {}".format(
            broken_ratio, inactive_ratio))

        # If we're not seeing sane values for the broken or inactive ratios,
        # bail out early. There's no point in aggregating.
        if broken_ratio >= 0.9 or inactive_ratio >= 0.9:
            raise Exception(
                "Unexpected ratio of broken pings or inactive clients. "
                "Broken ratio: {0}, inactive ratio: {1}".format(
                    broken_ratio, inactive_ratio))

        # Process the data, transforming it in the form we desire.
        device_map = build_device_map()
        processed_data = filtered_data.map(
            lambda d: prepare_data(d, device_map))

        logger.info("Aggregating entries...")
        aggregated_pings = aggregate_data(processed_data)

        # Get the sample count, we need it to compute the percentages instead
        # of raw numbers. Since we're getting only the newest ping for each
        # client, we can simply count the number of pings.
        # THIS MAY NOT BE CONSTANT ACROSS WEEKS!
        valid_records_count = filtered_data.count()

        # Collapse together groups that count less than 1% of our samples.
        threshold_to_collapse = int(valid_records_count * 0.01)

        logger.info("Collapsing smaller groups into the other bucket "
                    "(threshold {th})".format(th=threshold_to_collapse))
        collapsed_aggregates = collapse_buckets(aggregated_pings,
                                                threshold_to_collapse)

        logger.info("Post-processing raw values...")
        processed_aggregates = finalize_data(
            collapsed_aggregates,
            valid_records_count,
            broken_ratio,
            inactive_ratio,
            chunk_start,
        )

        if not validate_finalized_data(processed_aggregates):
            raise Exception("The aggregates failed to validate.")

        # Write the week start/end in the filename.
        suffix = ("-" + chunk_start.strftime("%Y%d%m") + "-" +
                  chunk_end.strftime("%Y%d%m"))
        file_name = get_file_name(suffix)

        date_to_json[file_name] = processed_aggregates

        # Move on to the next chunk: just add one day to the end of the last
        # chunk.
        chunk_start = chunk_end + dt.timedelta(days=1)

    return date_to_json
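# Both versions of generate_report fold rare configurations into a catch-all
# bucket before publishing. Below is a minimal sketch of that step, assuming
# the aggregates map each dimension to a {bucket_label: count} dict; the real
# collapse_buckets in the hardware report code may differ.
def collapse_buckets(aggregates, threshold):
    """Fold buckets with fewer than `threshold` samples into "Other".

    This keeps the published report from exposing rare (and potentially
    identifying) hardware configurations as standalone entries.
    """
    collapsed = {}
    for dimension, buckets in aggregates.items():
        merged = {}
        for label, count in buckets.items():
            key = label if count >= threshold else "Other"
            merged[key] = merged.get(key, 0) + count
        collapsed[dimension] = merged
    return collapsed


# Example: with a threshold of 9 samples, the 3 BeOS clients are folded away.
# collapse_buckets({"os": {"Windows 10": 900, "BeOS": 3}}, 9)
# => {"os": {"Windows 10": 900, "Other": 3}}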