def PostProcess(df):
  """Tidy up a DataFrame of results before it is written out as a csv.

  The following steps are applied, in order:
  - Rows sharing the same (test_suite, bot, point_id) are snapped to the
    earliest timestamp among them.
  - Rows whose snapped timestamp is not within the last four months are
    dropped.
  - A 'quarter' column and a boolean 'reference' column are added; a row is
    a reference if it falls on the latest day seen for its
    (quarter, test_suite, bot).
  - Values with units starting with 'ms_' are divided by 1000, and values
    with units starting with 'n%_' are multiplied by 100. Missing units
    are replaced by the empty string.
  - The 'point_id', 'chromium_rev', 'clank_rev' and 'trace_url' columns are
    removed.

  Args:
    df: A DataFrame with the columns 'test_suite', 'bot', 'point_id',
      'timestamp', 'units', 'value', 'chromium_rev', 'clank_rev' and
      'trace_url'. Timestamps are compared against a timezone-naive UTC
      cutoff, so they are presumably naive UTC themselves (confirm with the
      caller). The input frame is left unmodified.

  Returns:
    A new DataFrame with the processed rows.

  Raises:
    KeyError: If any of the columns listed above is missing.
  """
  # Snap stories on the same test run to the same timestamp. This is kept as
  # a separate series (rather than written back right away) so that the
  # caller's frame is not modified.
  snapped = df.groupby(
      ['test_suite', 'bot', 'point_id'])['timestamp'].transform('min')

  # Prevent the size of the output from growing without bounds. Limit for
  # DataStudio input appears to be around 100MiB.
  four_months_ago = pandas.Timestamp.now('UTC') - pandas.DateOffset(months=4)
  is_recent = snapped > four_months_ago.tz_convert(None)
  df = df[is_recent].copy()
  df['timestamp'] = snapped[is_recent]

  # We use all runs on the latest day for each quarter as reference.
  df['quarter'] = df['timestamp'].dt.to_period('Q')
  df['reference'] = df['timestamp'].dt.date == df.groupby(
      ['quarter', 'test_suite', 'bot'])['timestamp'].transform('max').dt.date

  # Change units for values in ms to seconds, and percent values.
  # Series.mask is used instead of assigning through .loc so no re-alignment
  # of a full-length series onto a subset of rows is needed; that
  # re-alignment fails when the index contains duplicate labels.
  df['units'] = df['units'].fillna('')
  is_ms_unit = df['units'].str.startswith('ms_')
  df['value'] = df['value'].mask(is_ms_unit, df['value'] / 1000)
  is_percentage = df['units'].str.startswith('n%_')
  df['value'] = df['value'].mask(is_percentage, df['value'] * 100)

  # Remove unused columns to save space in the output csv.
  for col in ('point_id', 'chromium_rev', 'clank_rev', 'trace_url'):
    del df[col]

  return df
def GetRevisionResults(item):
  """Aggregate the results from jobs that ran on a particular revision.

  Args:
    item: A mapping with a 'revision' string and a 'timestamp' string whose
      date part comes before a 'T' separator. It is also handed to
      RevisionResultsFile to locate the csv file with the job results.

  Returns:
    A DataFrame with the columns 'revision', 'timestamp', 'label',
    'benchmark', 'name', 'mean' and 'count'. When no wanted measurements
    are found it holds a single placeholder row for the revision.
  """
  # Load the pinpoint csv results. The dtype is pinned so that job_id's are
  # always read as strings, even when some of them look like large numbers.
  results = pd.read_csv(RevisionResultsFile(item), dtype={'job_id': str})
  assert results['change'].str.contains(item['revision']).all(), (
      'Not all results match the expected git revision')

  # Keep only the measurements and stories that we care about.
  is_wanted = (results['name'].isin(MEASUREMENTS)
               & results['story'].isin(ACTIVE_STORIES))
  selected = results[is_wanted]

  if selected.empty:
    # Nothing to aggregate: emit one placeholder row so the cache can still
    # remember that this revision has been processed.
    summary = pd.DataFrame({
        'change': item['revision'],
        'name': '(missing)',
        'benchmark': '(missing)',
        'unit': '',
        'mean': np.nan,
        'count': 0,
    }, index=[0])
  else:
    # Aggregate over the results of individual stories.
    per_measurement = selected.groupby(['change', 'name', 'benchmark', 'unit'])
    summary = per_measurement['mean'].agg(['mean', 'count']).reset_index()

  # Data Studio dashboards expect times in seconds rather than milliseconds.
  in_milliseconds = summary['unit'].str.startswith('ms_')
  summary.loc[in_milliseconds, 'mean'] /= 1000

  # A '+' in the change marks the jobs that ran with the tested patch.
  label_for_patched = {True: 'with_patch', False: 'without_patch'}
  summary['label'] = summary['change'].str.contains(r'\+').map(
      label_for_patched)

  # Add timestamp and revision information. The date is snapped to noon and
  # kept naive (i.e. no timezone), so the dashboard doesn't get confused with
  # dates close to the end of day.
  day = item['timestamp'].split('T')[0]
  noon = pd.Timestamp(day + 'T12:00:00')
  summary['timestamp'] = noon
  summary['revision'] = item['revision']

  # Pretend that jobs without the patch ran a year earlier; this makes it
  # easier to visualize and compare timeseries from runs with/without the
  # patch in Data Studio dashboards.
  is_unpatched = summary['label'] == 'without_patch'
  summary.loc[is_unpatched, 'timestamp'] = noon - pd.DateOffset(years=1)

  output_columns = [
      'revision', 'timestamp', 'label', 'benchmark', 'name', 'mean', 'count'
  ]
  return summary[output_columns]
def TimeAgo(**kwargs):
  """Return the current time, as given by TZ, moved back by some offset.

  Args:
    **kwargs: Keyword arguments passed through to pd.DateOffset to describe
      how far back to go, e.g. days=1 or months=4.

  Returns:
    The pd.Timestamp obtained by subtracting the offset from
    pd.Timestamp.now(TZ).
  """
  now = pd.Timestamp.now(TZ)
  offset = pd.DateOffset(**kwargs)
  return now - offset
def Yesterday():
  """Return the pd.Timestamp for exactly one day before the current time.

  The current time is taken from pd.Timestamp.now(TZ).
  """
  now = pd.Timestamp.now(TZ)
  one_day = pd.DateOffset(days=1)
  return now - one_day