def run_agg_jm(date, ctx, sql_context, verbose=False):
    """
    Run aggregation for the JobMonitoring stream for a given date.

    Builds a two-level Spark SQL query:
      1. inner query joins jm_df with f_b_s_df (file/block/site table) on
         file name, resolving each accessed file to its dataset and
         attaching app, uid, dn, stream, timestamp, cpu and wall-clock time;
      2. outer query groups by (dn, dataset_name, site_name, app) and
         computes nacc, distinct_users, site_tier, cpu_time and wc_time.

    :param date: date to aggregate (short format; converted internally)
    :param ctx: spark context passed to jm_tables
    :param sql_context: SQL context the query is executed against
    :param verbose: when True, print record counts (triggers extra Spark
                    actions, so it costs additional passes over the data)
    :returns: dataframe with the dataset column split into
              primds/procds/tier via split_dataset_col
    """
    print('Starting JobMonitoring part')

    # UNIX timestamp of the aggregation date, emitted as a constant column
    unix_date = short_date_to_unix(short_date_string(date))
    # jm_tables expects the long date format
    date = long_date_string(date)

    # Register JobMonitoring tables (jm_df) in sql_context
    jm_df = jm_tables(ctx, sql_context, date=date, verbose=verbose)
    if verbose:
        print('Found %s records in JobMonitoring stream' % jm_df['jm_df'].count())

    # Inner query: per-access rows with dataset resolved through the
    # file/block/site join. stream4app/dn2uuid/parse_dn are UDFs
    # registered elsewhere in this module.
    cols = [
        'SiteName AS site_name',
        'dataset_name',
        'stream4app(jm_df.SubmissionTool) AS app',
        'dn2uuid(GridName) AS uid',
        'parse_dn(GridName) AS dn',
        '\"crab\" AS stream',
        '%s AS timestamp' % unix_date,
        'WrapCPU AS cpu',
        'WrapWC as wc'
    ]
    query = "SELECT %s FROM jm_df "\
            "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName " \
            % ','.join(cols)

    # Outer query: aggregate the per-access rows per (dn, dataset,
    # site, app). first() is used for columns constant within a group.
    cols = [
        'dn',
        'dataset_name',
        'site_name',
        'app',
        'first(uid) as uid',
        'first(stream) as stream',
        'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc',
        'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier',
        'SUM(cpu) AS cpu_time',
        'SUM(wc) AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # Split "dataset_name" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished JobMonitoring part (output is %s records)' % result.count())
    else:
        print('Finished JobMonitoring part')

    return result
def run_agg_cmssw(date, ctx, sql_context, verbose=False):
    """
    Run aggregation for the CMSSW stream for a given date.

    Mirrors run_agg_jm: a two-level Spark SQL query where the inner
    query joins cmssw_df with f_b_s_df (file/block/site table) on file
    LFN to resolve datasets, and the outer query groups by
    (dn, dataset_name, site_name, app). CMSSW records carry no CPU or
    wall-clock accounting, so cpu_time and wc_time are emitted as -1.

    :param date: date to aggregate (short format; converted internally)
    :param ctx: spark context passed to cmssw_tables
    :param sql_context: SQL context the query is executed against
    :param verbose: when True, print record counts (triggers extra Spark
                    actions, so it costs additional passes over the data)
    :returns: dataframe with the dataset column split into
              primds/procds/tier via split_dataset_col
    """
    print('Starting CMSSW part')

    # UNIX timestamp of the aggregation date, emitted as a constant column
    unix_date = short_date_to_unix(short_date_string(date))
    # cmssw_tables expects the long date format
    date = long_date_string(date)

    # Register CMSSW tables (cmssw_df) in sql_context
    cmssw_df = cmssw_tables(ctx, sql_context, date=date, verbose=verbose)
    if verbose:
        print('Found %s records in CMSSW stream' % cmssw_df['cmssw_df'].count())

    # Inner query: per-access rows with dataset resolved through the
    # file/block/site join. parse_app/dn2uuid/parse_dn/stream4app are
    # UDFs registered elsewhere in this module.
    cols = [
        'cmssw_df.SITE_NAME AS site_name',
        'dataset_name',
        'parse_app(cmssw_df.APP_INFO) AS app',
        'dn2uuid(cmssw_df.USER_DN) AS uid',
        'parse_dn(cmssw_df.USER_DN) AS dn',
        'stream4app(cmssw_df.APP_INFO) as stream',
        '%s AS timestamp' % unix_date,
        '-1 AS cpu'
    ]
    query = "SELECT %s FROM cmssw_df "\
            "JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN " \
            % ','.join(cols)

    # Outer query: aggregate the per-access rows per (dn, dataset,
    # site, app). first() is used for columns constant within a group;
    # timing columns are the -1 sentinel (not present in this stream).
    cols = [
        'dn',
        'dataset_name',
        'site_name',
        'app',
        'first(uid) as uid',
        'first(stream) as stream',
        'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc',
        'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier',
        '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # Split "dataset_name" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished CMSSW part (output is %s records)' % result.count())
    else:
        print('Finished CMSSW part')

    return result