Example #1
def create_file_block_site_table(ctx, sql_context, verbose=False):
    """
    Joins fdf, bdf, ddf and PhEDEx tables and produces table with file name, block name, dataset name and site name.
    Site name is obtained from PhEDEx. Before site name is used, it is cleaned with clean_site_name function.
    After join is complete, only unique records are left in the table by using DISTINCT function.
    """
    print('Starting file_block_site generation')

    cols = [
        'f_logical_file_name AS file_name', 'b_block_name AS block_name',
        'clean_site_name(node_name) AS site_name', 'd_dataset AS dataset_name'
    ]

    # Join FDF, BDF, DDF and PhEDEx tables on block id, dataset id and block name
    query = ("SELECT %s FROM fdf "
             "JOIN bdf ON fdf.f_block_id = bdf.b_block_id "
             "JOIN ddf ON fdf.f_dataset_id = ddf.d_dataset_id "
             "JOIN phedex_df ON bdf.b_block_name = phedex_df.block_name") % ','.join(cols)

    if verbose:
        print('Will run query to generate temp file_block_site table')

    result = run_query(query, sql_context, verbose)
    result.registerTempTable('f_b_all_df')

    query_distinct = ("SELECT DISTINCT * FROM f_b_all_df ORDER BY file_name")
    result_distinct = run_query(query_distinct, sql_context, verbose)
    result_distinct.registerTempTable('f_b_s_df')

    if verbose:
        print('Created temp table from joined DDF, FDF, BDF and PhEDEx tables')
        print('After the DISTINCT query the record count changed %s -> %s' %
              (result.count(), result_distinct.count()))
        result_distinct.show(20)
        print_rows(result_distinct, query_distinct, verbose, 5)
        result_distinct.printSchema()

    print('Finished file_block_site generation')
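
A minimal, hypothetical driver sketch for the function above: it assumes the fdf, bdf, ddf and phedex_df temp tables have already been registered in the SQLContext, and that clean_site_name (shown here only as a placeholder) is registered as a UDF before the query runs.

from pyspark import SparkContext
from pyspark.sql import SQLContext

def clean_site_name(site):
    # Placeholder only: the real helper cleans the PhEDEx node name.
    return site

ctx = SparkContext(appName='cms_aggregation')
sql_context = SQLContext(ctx)

# The SQL inside create_file_block_site_table calls clean_site_name,
# so it must be registered as a UDF first.
sql_context.registerFunction('clean_site_name', clean_site_name)

# ... load the DBS and PhEDEx dataframes here and register them as the
#     temp tables fdf, bdf, ddf and phedex_df ...

create_file_block_site_table(ctx, sql_context, verbose=True)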
Example #2
def run_agg_jm(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for JobMonitoring stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Result dataframe is sorted by nacc.
    """
    print('Starting JobMonitoring part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = long_date_string(date)

    # Create JobMonitoring tables in sql_context
    jm_df = jm_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in JobMonitoring stream' %
              jm_df['jm_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: crab             +
    # - timestamp                +
    # - site tier                +
    # - cpu time                 +

    # NOTE: this first cols list belongs to the commented-out single-pass query
    # below; it is immediately replaced by the two-stage query columns that follow.
    cols = [
        'SiteName AS site_name', 'dataset_name', 'count(dataset_name) AS nacc',
        'count(distinct(UserId)) AS distinct_users', '\"crab\" AS stream',
        '%s AS timestamp' % unix_date,
        'first(tier_from_site_name(SiteName)) AS site_tier',
        'SUM(WrapCPU) AS cpu_time'
    ]

    # Build a query with "cols" columns
    #    query = ("SELECT %s FROM jm_df "\
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName "\
    #             "GROUP BY jm_df.SiteName, dataset_name") \
    cols = [
        'SiteName AS site_name', 'dataset_name',
        'stream4app(jm_df.SubmissionTool) AS app', 'dn2uuid(GridName) AS uid',
        'parse_dn(GridName) AS dn', '\"crab\" AS stream',
        '%s AS timestamp' % unix_date, 'WrapCPU AS cpu', 'WrapWC as wc'
    ]
    query = "SELECT %s FROM jm_df "\
             "JOIN f_b_s_df ON f_b_s_df.file_name = jm_df.FileName " \
             % ','.join(cols)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) as uid',
        'first(stream) as stream', 'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier',
        'SUM(cpu) AS cpu_time', 'SUM(wc) AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished JobMonitoring part (output is %s records)' %
              result.count())
    else:
        print('Finished JobMonitoring part')

    return result
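
The split_dataset_col helper is defined elsewhere; a minimal sketch of what such a helper could look like, assuming dataset names follow the usual /primds/procds/tier pattern (dropping the original column is an assumption):

from pyspark.sql.functions import col, split

def split_dataset_col(df, dcol):
    # Sketch only: split '/primds/procds/tier' dataset names into three columns.
    parts = split(col(dcol), '/')
    return df.withColumn('primds', parts.getItem(1)) \
             .withColumn('procds', parts.getItem(2)) \
             .withColumn('tier', parts.getItem(3)) \
             .drop(dcol)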
Example #3
def run_agg_aaa(date,
                ctx,
                sql_context,
                hdir='hdfs:///project/monitoring/archive/xrootd/enr/gled',
                verbose=False):
    """
    Runs aggregation for JobMonitoring stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Data is taken from /project/monitoring/archive/xrootd/enr/gled and not the default location (raw instead of enr)
    because enr records have src_experiment_site. src_experiment_site is used as site_name.
    Result dataframe is sorted by nacc.
    """
    print('Starting AAA part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = short_date_string(date)

    # Create AAA tables in sql_context
    aaa_df = aaa_tables_enr(sql_context, hdir=hdir, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in AAA stream' % aaa_df['aaa_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: aaa              +
    # - timestamp                +
    # - site tier                +
    # - cpu time                -1

    cols = [
        'src_experiment_site AS site_name', 'dataset_name',
        'count(dataset_name) AS nacc',
        'count(distinct(aaa_df.user_dn)) AS distinct_users',
        '\"aaa\" as stream',
        '%s AS timestamp' % unix_date,
        'first(tier_from_site_name(src_experiment_site)) AS site_tier',
        '-1 AS cpu_time'
    ]

    # Build a query with "cols" columns
    #    query = ("SELECT %s FROM aaa_df " \
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = aaa_df.file_lfn " \
    #             "GROUP BY src_experiment_site, dataset_name") \
    #             % ','.join(cols)
    cols = [
        'src_experiment_site AS site_name', 'dataset_name',
        '\"xrootd\" AS app', 'dn2uuid(aaa_df.user_dn) AS uid',
        'parse_dn(aaa_df.user_dn) AS dn', '\"aaa\" as stream',
        '%s AS timestamp' % unix_date, '-1 AS cpu'
    ]
    query = "SELECT %s FROM aaa_df " \
             "JOIN f_b_s_df ON f_b_s_df.file_name = aaa_df.file_lfn " \
             % ','.join(cols)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) as uid',
        'first(stream) as stream', 'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished AAA part (output is %s records)' % result.count())
    else:
        print('Finished AAA part')

    return result
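
The tier_from_site_name UDF used in the SQL above is also defined elsewhere; assuming CMS-style site names such as T2_CH_CERN, where the leading token encodes the tier, a plausible sketch is:

def tier_from_site_name(site_name):
    # Sketch only: take the leading 'T0'/'T1'/'T2'/'T3' token of a CMS site name.
    if not site_name:
        return 'NA'
    return site_name.split('_')[0]

# It has to be registered before the queries can call it, e.g.:
# sql_context.registerFunction('tier_from_site_name', tier_from_site_name)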
Example #4
def run_agg_cmssw(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for CMSSW stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Result dataframe is sorted by nacc.
    """
    print('Starting CMSSW part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = long_date_string(date)

    # Create CMSSW tables in sql_context
    cmssw_df = cmssw_tables(ctx, sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in CMSSW stream' %
              cmssw_df['cmssw_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: cmssw            +
    # - timestamp                +
    # - site tier                +
    # - cpu time                -1

    cols = [
        'cmssw_df.SITE_NAME AS site_name', 'dataset_name',
        'count(dataset_name) AS nacc',
        'count(distinct(USER_DN)) AS distinct_users', '\"cmssw\" as stream',
        '%s AS timestamp' % unix_date,
        'first(tier_from_site_name(cmssw_df.SITE_NAME)) AS site_tier',
        '-1 AS cpu_time'
    ]

    # Build a query with "cols" columns
    #    query = ("SELECT %s FROM cmssw_df "\
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN "\
    #             "GROUP BY cmssw_df.SITE_NAME, dataset_name") \
    #             % ','.join(cols)
    cols = [
        'cmssw_df.SITE_NAME AS site_name', 'dataset_name',
        'parse_app(cmssw_df.APP_INFO) AS app',
        'dn2uuid(cmssw_df.USER_DN) AS uid', 'parse_dn(cmssw_df.USER_DN) AS dn',
        'stream4app(cmssw_df.APP_INFO) as stream',
        '%s AS timestamp' % unix_date, '-1 AS cpu'
    ]
    query = "SELECT %s FROM cmssw_df "\
             "JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN " \
             % ','.join(cols)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) as uid',
        'first(stream) as stream', 'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished CMSSW part (output is %s records)' % result.count())
    else:
        print('Finished CMSSW part')

    return result
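
For readability, the nested query that run_agg_cmssw assembles expands to roughly the following SQL (rendered here as comments, with <unix_date> standing in for the numeric timestamp):

# SELECT dn, dataset_name, site_name, app,
#        first(uid) AS uid, first(stream) AS stream, first(timestamp) AS timestamp,
#        count(dataset_name) AS nacc, count(dn) AS distinct_users,
#        first(tier_from_site_name(site_name)) AS site_tier,
#        -1 AS cpu_time, -1 AS wc_time
# FROM (
#     SELECT cmssw_df.SITE_NAME AS site_name, dataset_name,
#            parse_app(cmssw_df.APP_INFO) AS app,
#            dn2uuid(cmssw_df.USER_DN) AS uid,
#            parse_dn(cmssw_df.USER_DN) AS dn,
#            stream4app(cmssw_df.APP_INFO) AS stream,
#            <unix_date> AS timestamp, -1 AS cpu
#     FROM cmssw_df
#     JOIN f_b_s_df ON f_b_s_df.file_name = cmssw_df.FILE_LFN
# ) QUERY1
# GROUP BY dn, dataset_name, site_name, app

The other run_agg_* functions build the same two-stage shape, differing only in the inner table and column mapping.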
Example #5
def run_agg_eos(date, ctx, sql_context, verbose=False):
    """
    Runs aggregation for EOS stream for a certain date.
    Function produces a dataframe that contains site name, dataset name, number of access, distinct users and stream.
    Site name is taken from f_b_s_df table which is joined by file name.
    Result dataframe is sorted by nacc.
    """
    print('Starting EOS part')

    # Make a UNIX timestamp from date
    unix_date = short_date_to_unix(short_date_string(date))

    # Convert date
    date = short_date_string(date)

    # Create EOS tables in sql_context
    eos_df = eos_tables(sql_context, date=date, verbose=verbose)

    if verbose:
        print('Found %s records in EOS stream' % eos_df['eos_df'].count())

    # - site name                +
    # - dataset name             +
    # - app                      +
    # - uid                      +
    # - dn                       +
    # - number of access (nacc)  +
    # - distinct users           +
    # - stream: eos              +
    # - timestamp                +
    # - site tier                +
    # - cpu time                -1

    cols = [
        'site_name', 'dataset_name', 'count(dataset_name) AS nacc',
        'count(distinct(eos_df.user_dn)) AS distinct_users',
        '\"eos\" as stream',
        '%s AS timestamp' % unix_date,
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time'
    ]

    # Build a query with "cols" columns
    #    query = ("SELECT %s FROM eos_df " \
    #             "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn " \
    #             "GROUP BY site_name, dataset_name") \
    #             % ','.join(cols)
    cols = [
        'site_name', 'dataset_name', 'parse_app(eos_df.application) AS app',
        'dn2uuid(eos_df.user_dn) AS uid', 'parse_dn(eos_df.user_dn) AS dn',
        '\"eos\" as stream',
        '%s AS timestamp' % unix_date, '-1 AS cpu'
    ]
    query = "SELECT %s FROM eos_df " \
             "JOIN f_b_s_df ON f_b_s_df.file_name = eos_df.file_lfn " \
             % ','.join(cols)
    cols = [
        'dn', 'dataset_name', 'site_name', 'app', 'first(uid) as uid',
        'first(stream) as stream', 'first(timestamp) as timestamp',
        'count(dataset_name) AS nacc', 'count(dn) AS distinct_users',
        'first(tier_from_site_name(site_name)) AS site_tier', '-1 AS cpu_time',
        '-1 AS wc_time'
    ]
    query = "SELECT %s FROM (%s) QUERY1 GROUP BY dn, dataset_name, site_name, app" \
            % (','.join(cols), query)

    result = run_query(query, sql_context, verbose)

    # result = result.sort(desc("nacc"))

    # Split "dataset" column into "primds", "procds" and "tier"
    result = split_dataset_col(result, 'dataset_name')

    if verbose:
        print('Finished EOS part (output is %s records)' % result.count())
    else:
        print('Finished EOS part')

    return result
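
Since all four run_agg_* functions return dataframes with the same column set, a downstream driver can concatenate them. A hypothetical sketch (the run_day name and the exact ordering of steps are assumptions, not part of the original code):

def run_day(date, ctx, sql_context, verbose=False):
    # The file/block/site lookup table must exist before the per-stream joins.
    create_file_block_site_table(ctx, sql_context, verbose=verbose)

    frames = [
        run_agg_jm(date, ctx, sql_context, verbose=verbose),
        run_agg_aaa(date, ctx, sql_context, verbose=verbose),
        run_agg_cmssw(date, ctx, sql_context, verbose=verbose),
        run_agg_eos(date, ctx, sql_context, verbose=verbose),
    ]

    # Concatenate the per-stream results; cpu_time/wc_time may need an explicit
    # cast so that the constant -1 columns match the summed columns' types.
    result = frames[0]
    for frame in frames[1:]:
        result = result.unionAll(frame)
    return result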