import datetime
import logging
import os

import pandas as pd


def GetCachedDataset():
  """Load the latest dataset from the local cache, downloading it if needed."""
  local_path = CachedFilePath(DATASET_PKL_FILE)
  if os.path.exists(local_path) or DownloadFromCloudStorage(local_path):
    return pd.read_pickle(local_path)
  else:
    return None
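
# A minimal usage sketch: callers should handle the None case, since the
# dataset may be missing both locally and in cloud storage. The warning
# message below is illustrative, not part of this module.
#
#   df = GetCachedDataset()
#   if df is None:
#     logging.warning('Dataset not found in cache or cloud storage.')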
def GetWithCache(filename, frame_maker, expires_after):
  """Get a data frame from cache or, if necessary, create and cache it.

  Args:
    filename: The name of a file for the cached copy of the data frame; it
      will be stored in CACHE_DIR.
    frame_maker: A function that takes no arguments and returns a data frame.
      It is only called if the cached copy does not exist or is too old.
    expires_after: A datetime.timedelta object; the cached copy will not be
      used if it was created longer than this time ago.

  Returns:
    The cached or newly created data frame.
  """
  filepath = os.path.join(CACHE_DIR, filename)
  try:
    timestamp = os.path.getmtime(filepath)
    last_modified = datetime.datetime.utcfromtimestamp(timestamp)
    expired = datetime.datetime.utcnow() > last_modified + expires_after
  except OSError:  # The file does not exist.
    expired = True
  if expired:
    df = frame_maker()
    if not os.path.exists(CACHE_DIR):
      os.makedirs(CACHE_DIR)
    df.to_pickle(filepath)
  else:
    df = pd.read_pickle(filepath)
  return df
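
# Example usage, a sketch: `LoadResults` is a hypothetical frame_maker that
# builds a fresh data frame; here the cached copy expires after one day.
#
#   def LoadResults():
#     return pd.DataFrame({'revision': ['abc123'], 'count': [1]})
#
#   df = GetWithCache(
#       'results.pkl', LoadResults, expires_after=datetime.timedelta(days=1))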
def AggregateAndUploadResults(state):
  """Aggregate results collected and upload them to cloud storage."""
  cached_results = CachedFilePath(DATASET_PKL_FILE)
  dfs = []
  keep_revisions = set(item['revision'] for item in state)
  if os.path.exists(cached_results):
    # To speed things up, start from the cache computed on previous runs.
    df = pd.read_pickle(cached_results)
    # Drop old data from revisions no longer in the recent state.
    df = df[df['revision'].isin(keep_revisions)]
    dfs.append(df)
    known_revisions = set(df['revision'])
  else:
    known_revisions = set()

  found_new = False
  for item in state:
    if item['revision'] in known_revisions or _SkipProcessing(item):
      # Revision is already in cache, jobs are not ready, or all have failed.
      continue
    if not found_new:
      logging.info('Processing data from new results:')
      found_new = True
    logging.info('- %s (%s)', item['timestamp'][:10], item['revision'])
    dfs.append(GetRevisionResults(item))

  if not found_new:
    logging.info('No new data found.')
    return

  # Otherwise update our cache and upload.
  df = pd.concat(dfs, ignore_index=True)
  df.to_pickle(cached_results)

  # Drop revisions with no results and mark the last result for each metric,
  # both with/without patch, as a 'reference'. This allows making score cards
  # comparing their most recent results in Data Studio dashboards.
  df = df[df['count'] > 0].copy()
  latest_result = df.groupby(
      ['label', 'benchmark', 'name'])['timestamp'].transform('max')
  df['reference'] = df['timestamp'] == latest_result

  dataset_file = CachedFilePath(DATASET_CSV_FILE)
  df.to_csv(dataset_file, index=False)
  UploadToCloudStorage(dataset_file)
  logging.info('Total %s rows of data uploaded.', len(df.index))
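
# The `state` argument is expected to be a list of dicts; a sketch of its
# shape, inferred from the lookups above. Field values are hypothetical, and
# any extra keys consumed by _SkipProcessing / GetRevisionResults are elided.
#
#   state = [
#       {'revision': 'abc123', 'timestamp': '2019-06-01T00:00:00', ...},
#   ]
#   AggregateAndUploadResults(state)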