Example #1
def run(date, fout, yarn=None, verbose=None):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate
    # read Phedex tables
    tables = {}
    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex_df = tables['phedex_df']

    # register user defined functions
    unix2date = udf(unix2human, StringType())
    siteFilter = udf(site_filter, IntegerType())
    one_day = 60 * 60 * 24

    # aggregate phedex info into dataframe
    cols = [
        'node_name', 'dataset_name', 'block_bytes', 'replica_time_create',
        'br_user_group_id'
    ]
    pdf = phedex_df.select(cols).where(siteFilter(col('node_name')) == 1)\
            .groupBy(['node_name', 'dataset_name', 'replica_time_create', 'br_user_group_id'])\
            .agg({'block_bytes':'sum'})\
            .withColumn('date', lit(date))\
            .withColumn('replica_date', unix2date(col('replica_time_create')))\
            .withColumnRenamed('sum(block_bytes)', 'size')\
            .withColumnRenamed('dataset_name', 'dataset')\
            .withColumnRenamed('node_name', 'site')\
            .withColumnRenamed('br_user_group_id', 'groupid')
    pdf.registerTempTable('pdf')
    pdf.persist(StorageLevel.MEMORY_AND_DISK)

    # write results back to HDFS; the fout parameter defines the output area
    # on HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        year, month, day = split_date(date)
        out = '%s/%s/%s/%s' % (fout, year, month, day)
        cols = ['date', 'site', 'dataset', 'size', 'replica_date', 'groupid']
        # don't write a header: when the data is read back it will mismatch
        # the data types, i.e. headers are strings while rows may hold
        # different data types
        pdf.select(cols)\
            .write.format("com.databricks.spark.csv")\
            .option("header", "true").save(out)

    ctx.stop()
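
Example #1 registers two helpers as Spark UDFs, unix2human and site_filter, whose definitions are not shown. A minimal sketch of what such helpers could look like, purely to illustrate the UDF pattern (the real implementations in the source project may differ):

import time

def unix2human(tstamp):
    "Convert a UNIX timestamp into a human-readable date string (illustrative sketch)"
    return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(tstamp))

def site_filter(node_name):
    "Return 1 for node names we want to keep, 0 otherwise (hypothetical rule)"
    return 1 if node_name and node_name.startswith('T') else 0
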
Example #2
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = SQLContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    ddf = tables['ddf']
    fdf = tables['fdf']

    print("### ddf from main", ddf)

    # join tables; cols = ['*'] would select all fields from the tables
    cols = ['d_dataset_id', 'd_dataset', 'f_logical_file_name', 'f_adler32']

    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # construct conditions
    adler = [
        'ad8f6ad2', '9c441343', 'f68d5dca', '81c90e2a', '471d2524', 'a3c1f077',
        '6f0018a0', '8bb03b60', 'd504882c', '5ede357f', 'b05303c3', '716d1776',
        '7e9cf258', '1945804b', 'ec7bc1d7', '12c87747', '94f2aa32'
    ]
    cond = 'f_adler32 in %s' % adler
    cond = cond.replace('[', '(').replace(']', ')')
    #    scols = ['f_logical_file_name']
    fjoin = joins.where(cond).distinct().select(cols)

    print_rows(fjoin, stmt, verbose)

    # write results back to HDFS; the fout parameter defines the output area
    # on HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
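
The cond string in Example #2 turns a Python list literal into a SQL IN clause via string replacement. An equivalent filter can be written with the DataFrame API's isin method, which avoids the string manipulation; a sketch using the same adler list and cols:

from pyspark.sql.functions import col

fjoin = joins.where(col('f_adler32').isin(adler)).distinct().select(cols)
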
Example #3
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """

    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}

    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex = tables['phedex_df']

    instances = [inst]
    if inst == 'all':
        instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']

    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val

    daf = reduce(lambda a, b: a.unionAll(b),
                 [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b),
                 [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b),
                 [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate per dataset: find the total DBS and PhEDEx size of each dataset

    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(ddf_df.d_dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'dbs_size')\
                   .drop(dbs_df.d_dataset_id)\
                   .drop(fdf_df.f_dataset_id)\
                   .drop(dbs_df.dataset_access_type)

    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
                   .agg({'dbs_size':'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset, site, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumnRenamed('block_bytes', 'phedex_size')\
                         .withColumnRenamed('dataset_name', 'dataset')\
                         .withColumnRenamed('node_name', 'site')

    # dataset, sites, phedex_size
    phedex_df = phedex_df.groupBy(['dataset'])\
                   .agg({'phedex_size':'sum', 'site': 'collect_set'})\
                   .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
                   .withColumnRenamed('collect_set(site)', 'sites')

    # Subtract to get leftovers

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset
    leftover_datasets_df = phedex_df.select('dataset').subtract(
        dbs_df.select('dataset'))

    # dataset, campaign, sites, phedex_size
    leftovers_df = leftover_datasets_df.select('dataset').join(
        phedex_df, 'dataset')  #.join(dbs_df, 'dataset')
    leftovers_df = leftovers_df.withColumn('campaign', extract_campaign_udf(leftovers_df.dataset))\
                               .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Subtract to get leftovers that don't even exist in DBS (orphans)

    ddf_datasets_df = ddf_df.withColumnRenamed('d_dataset',
                                               'dataset').select('dataset')
    leftover_datasets_df = phedex_df.select('dataset').subtract(
        ddf_datasets_df)

    # dataset, campaign, sites, phedex_size
    leftovers_orphans_df = leftover_datasets_df.select('dataset').join(
        phedex_df, 'dataset')  #.join(dbs_df, 'dataset')
    leftovers_orphans_df = leftovers_orphans_df.withColumn('campaign', extract_campaign_udf(leftovers_orphans_df.dataset))\
                                               .select(['dataset', 'campaign', 'sites', 'phedex_size'])

    # Sum total size of leftovers
    all_leftovers_size = leftovers_df.select(
        'phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]
    orphan_leftovers_size = leftovers_orphans_df.select(
        'phedex_size').groupBy().sum().rdd.map(lambda x: x[0]).collect()[0]

    print('All leftovers PhEDEx size: %s' %
          bytes_to_readable(all_leftovers_size))
    print('Orphan leftovers PhEDEx size: %s' %
          bytes_to_readable(orphan_leftovers_size))

    # write results back to HDFS; the fout parameter defines the output area
    # on HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        leftovers_df.write.format("com.databricks.spark.csv")\
                          .option("header", "true").save('%s/all' % fout)

        leftovers_orphans_df.write.format("com.databricks.spark.csv")\
                            .option("header", "true").save('%s/orphans' % fout)

    ctx.stop()
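
The extract_campaign_udf above relies on the structure of CMS dataset names, /<primary_dataset>/<processing_string>/<data_tier>: splitting on '/' yields an empty first element, so index 2 is the processing string (used here as the campaign) and index 3 is the data tier. For example:

dataset = '/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM'
parts = dataset.split('/')
# parts == ['', 'SingleMu', 'CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000', 'DQM']
campaign = parts[2]  # processing string, treated as the campaign
tier = parts[3]      # data tier, 'DQM' in this case
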
Example #4
def run(fout, date, yarn=None, verbose=None, inst='GLOBAL', limit=100):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """

    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)
    
    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}

    tables.update(phedex_tables(sqlContext, verbose=verbose, fromdate=fromdate, todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL'] # , 'PHYS01', 'PHYS02', 'PHYS03'
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val
    
    daf = reduce(lambda a,b: a.unionAll(b), [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a,b: a.unionAll(b), [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a,b: a.unionAll(b), [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # d_dataset_id, d_dataset, dataset_access_type
    dbs_df = ddf_df.join(daf_df, ddf_df.d_dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(ddf_df.d_dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .join(fdf_df, dbs_df.d_dataset_id == fdf_df.f_dataset_id)\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'dbs_size')\
                   .drop(dbs_df.d_dataset_id)\
                   .drop(fdf_df.f_dataset_id)\
                   .drop(dbs_df.dataset_access_type)
    
    # dataset, dbs_size
    dbs_df = dbs_df.groupBy(['dataset'])\
                   .agg({'dbs_size':'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')

    # dataset_name, block_bytes, node_name
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_all_df = phedex.select(phedex_cols)
    
    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit)
    aggregate(sqlContext, fout, phedex_all_df, dbs_df, limit, disk_only=True)

    ctx.stop()
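
The aggregate helper called with disk_only=True is not shown in this example. Judging from Example #8, disk-resident replicas are those whose site name does not end in _MSS, _Buffer, or _Export, so a hypothetical pre-filter for the disk-only pass could look like this (an assumption, not the project's actual implementation):

from pyspark.sql.functions import col

# keep only replicas on disk-like endpoints (assumed convention from Example #8)
disk_df = phedex_all_df.where(~col('node_name').rlike('_(MSS|Buffer|Export)$'))
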
Example #5
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """
    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(phedex_tables(sqlContext, verbose=verbose))
    phedex_df = tables['phedex_df']
    daf = tables['daf']
    ddf = tables['ddf']
    bdf = tables['bdf']
    fdf = tables['fdf']
    aef = tables['aef']
    pef = tables['pef']
    mcf = tables['mcf']
    ocf = tables['ocf']
    rvf = tables['rvf']

    print("### ddf from main", ddf)

    # aggregate phedex info into dataframe
    phedex_cols = [
        'node_name', 'dataset_name', 'dataset_is_open', 'block_bytes',
        'replica_time_create'
    ]
    newpdf = phedex_df.select(phedex_cols).groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
            .agg({'block_bytes':'sum', 'replica_time_create':'max'})\
            .withColumnRenamed('sum(block_bytes)', 'pbr_size')\
            .withColumnRenamed('max(replica_time_create)', 'max_replica_time')
    newpdf.registerTempTable('newpdf')
    #    print_rows(newpdf, 'newpdf', verbose)
    #    newpdf.persist(StorageLevel.MEMORY_AND_DISK)

    # join tables; cols = ['*'] would select all fields from the tables
    cols = [
        'd_dataset_id', 'd_dataset', 'd_creation_date', 'd_is_dataset_valid',
        'f_event_count', 'f_file_size', 'dataset_access_type',
        'acquisition_era_name', 'processing_version'
    ]

    stmt = 'SELECT %s FROM ddf JOIN fdf ON ddf.d_dataset_id = fdf.f_dataset_id JOIN daf ON ddf.d_dataset_access_type_id = daf.dataset_access_type_id JOIN aef ON ddf.d_acquisition_era_id = aef.acquisition_era_id JOIN pef ON ddf.d_processing_era_id = pef.processing_era_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)
    #    print_rows(joins, 'joins', verbose)

    # keep joins table around
    #    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    cond = 'dataset_access_type = "VALID" AND d_is_dataset_valid = 1'
    fjoin = joins.where(cond).distinct().select(cols)

    # at this step we have fjoin table with Row(d_dataset_id=9413359, d_dataset=u'/SingleMu/CMSSW_7_1_0_pre9-GR_R_71_V4_RelVal_mu2012D_TEST-v6000/DQM', d_creation_date=1406060166.0, d_is_dataset_valid=1, f_event_count=5318, f_file_size=21132638.0, dataset_access_type=u'DELETED', acquisition_era_name=u'CMSSW_7_1_0_pre9', processing_version=u'6000'))

    newdf = fjoin\
            .groupBy(['d_dataset','d_dataset_id','dataset_access_type','acquisition_era_name','processing_version'])\
            .agg({'f_event_count':'sum', 'f_file_size':'sum', 'd_creation_date':'max'})\
            .withColumnRenamed('sum(f_event_count)', 'evts')\
            .withColumnRenamed('sum(f_file_size)', 'size')\
            .withColumnRenamed('max(d_creation_date)', 'date')

    # at this point we have the newdf dataframe with our collected stats for every dataset
    # let's join it with release info
    newdf.registerTempTable('newdf')
    cols = [
        'd_dataset_id', 'd_dataset', 'evts', 'size', 'date',
        'dataset_access_type', 'acquisition_era_name', 'processing_version',
        'r_release_version'
    ]
    stmt = 'SELECT %s FROM newdf JOIN mcf ON newdf.d_dataset_id = mcf.mc_dataset_id JOIN ocf ON mcf.mc_output_mod_config_id = ocf.oc_output_mod_config_id JOIN rvf ON ocf.oc_release_version_id = rvf.r_release_version_id' % ','.join(
        cols)
    agg_dbs_df = sqlContext.sql(stmt)
    agg_dbs_df.registerTempTable('agg_dbs_df')
    #    print_rows(agg_dbs_df, 'agg_dbs_df', verbose)

    # keep agg_dbs_df table around
    #    agg_dbs_df.persist(StorageLevel.MEMORY_AND_DISK)

    # join dbs and phedex tables
    #    cols = ['d_dataset_id','d_dataset','evts','size','date','dataset_access_type','acquisition_era_name','processing_version','r_release_version','dataset_name','node_name','pbr_size','dataset_is_open','max_replica_time']
    cols = [
        'd_dataset', 'evts', 'size', 'date', 'dataset_access_type',
        'acquisition_era_name', 'r_release_version', 'node_name', 'pbr_size',
        'dataset_is_open', 'max_replica_time'
    ]
    stmt = 'SELECT %s FROM agg_dbs_df JOIN newpdf ON agg_dbs_df.d_dataset = newpdf.dataset_name' % ','.join(
        cols)
    finaldf = sqlContext.sql(stmt)

    # keep agg_dbs_df table around
    #    finaldf.persist(StorageLevel.MEMORY_AND_DISK)
    #    print_rows(finaldf, stmt, verbose)

    # write results back to HDFS; the fout parameter defines the output area
    # on HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        ndf = split_dataset(finaldf, 'd_dataset')
        ndf.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
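
The dictionary form of agg used throughout these examples produces columns named sum(...)/max(...), which are then renamed with withColumnRenamed. The same aggregation can be written with column expressions and alias, avoiding the renames; a sketch for the newpdf step above:

from pyspark.sql import functions as F

newpdf = phedex_df.select(phedex_cols)\
        .groupBy(['node_name', 'dataset_name', 'dataset_is_open'])\
        .agg(F.sum('block_bytes').alias('pbr_size'),
             F.max('replica_time_create').alias('max_replica_time'))
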
Example #6
def run(fout,
        date,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """

    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read Phedex and DBS tables
    tables = {}

    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex = tables['phedex_df']

    instances = ['GLOBAL', 'PHYS01', 'PHYS02', 'PHYS03']
    for instance in instances:
        dbs_dict = dbs_tables(sqlContext, inst=instance, verbose=verbose)
        for key, val in dbs_dict.items():
            new_key = '%s_%s' % (key, instance)
            tables[new_key] = val

    daf = reduce(lambda a, b: a.unionAll(b),
                 [tables['daf_%s' % x] for x in instances])
    ddf = reduce(lambda a, b: a.unionAll(b),
                 [tables['ddf_%s' % x] for x in instances])
    fdf = reduce(lambda a, b: a.unionAll(b),
                 [tables['fdf_%s' % x] for x in instances])

    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # Aggregate by campaign and find total PhEDEx and DBS size of each campaign

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])

    # dataset, size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
                   .drop('f_dataset_id')\
                   .drop('d_dataset_id')\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'size')\
                   .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(dbs_df.dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # campaign, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .withColumn('campaign', extract_campaign_udf(dbs_df.dataset))\
                   .groupBy(['campaign'])\
                   .agg({'size':'sum'})\
                   .withColumnRenamed('sum(size)', 'dbs_size')\
                   .drop('dataset')

    # campaign, phedex_size
    phedex_cols = ['dataset_name', 'block_bytes']
    phedex_df = phedex.select(phedex_cols)
    phedex_df = phedex_df.withColumn('campaign', extract_campaign_udf(phedex_df.dataset_name))\
                .groupBy(['campaign'])\
                .agg({'block_bytes':'sum'})\
                .withColumnRenamed('sum(block_bytes)', 'phedex_size')

    # campaign, dbs_size, phedex_size
    dbs_phedex_df = dbs_df.join(phedex_df, dbs_df.campaign == phedex_df.campaign)\
                          .drop(dbs_df.campaign)

    print('DISTINCT DBS AND PHEDEX CAMPAIGN COUNT:')
    print(dbs_phedex_df.select('campaign').distinct().count())

    # Select campaign - site pairs and their sizes (from PhEDEx)

    # campaign, site, size
    phedex_cols = ['dataset_name', 'node_name', 'block_bytes']
    campaign_site_df = phedex.select(phedex_cols)
    campaign_site_df = campaign_site_df.withColumn('campaign', extract_campaign_udf(campaign_site_df.dataset_name))\
                .groupBy(['campaign', 'node_name'])\
                .agg({'block_bytes':'sum'})\
                .withColumnRenamed('sum(block_bytes)', 'size')\
                .withColumnRenamed('node_name', 'site')

    # Aggregate data for site - campaign count table

    # site, count
    site_campaign_count_df = campaign_site_df.groupBy(['site'])\
                                             .agg(countDistinct('campaign'))\
                                             .withColumnRenamed('count(campaign)', 'campaign_count')\
                                             .orderBy('campaign_count', ascending=False)\
                                             .limit(LIMIT)

    # Find two most significant sites for each campaign

    columns_before_pivot = campaign_site_df.columns

    result = campaign_site_df.groupBy(['campaign'])\
                             .pivot('site')\
                             .sum('size')\
                             .na.fill(0)

    columns_after_pivot = result.columns
    sites_columns = [
        x for x in columns_after_pivot if x not in columns_before_pivot
    ]

    number_of_sites_udf = udf(lambda row: len([x for x in row if x != 0]),
                              IntegerType())
    mss_udf = udf(get_mss, LongType())
    second_mss_udf = udf(get_second_mss, LongType())
    mss_name_udf = udf(lambda row: get_mss_name(row, sites_columns),
                       StringType())
    second_mss_name_udf = udf(
        lambda row: get_second_mss_name(row, sites_columns), StringType())

    result = result.withColumn('sites', number_of_sites_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('mss', mss_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('mss_name', mss_name_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('second_mss', second_mss_udf(struct([result[x] for x in sites_columns])))\
                   .withColumn('second_mss_name', second_mss_name_udf(struct([result[x] for x in sites_columns])))

    # campaign, phedex_size, dbs_size, mss, mss_name, second_mss, second_mss_name, sites
    result = result.join(dbs_phedex_df, result.campaign == dbs_phedex_df.campaign)\
                   .drop(result.campaign)

    sorted_by_phedex = result.orderBy(result.phedex_size,
                                      ascending=False).limit(LIMIT)
    sorted_by_dbs = result.orderBy(result.dbs_size,
                                   ascending=False).limit(LIMIT)

    # write results back to HDFS; the fout parameter defines the output area
    # on HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        sorted_by_phedex.write.format("com.databricks.spark.csv")\
                              .option("header", "true").save('%s/phedex' % fout)

        sorted_by_dbs.write.format("com.databricks.spark.csv")\
                           .option("header", "true").save('%s/dbs' % fout)

        site_campaign_count_df.write.format("com.databricks.spark.csv")\
                              .option("header", "true").save('%s/site_campaign_count' % fout)

    ctx.stop()
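
The helpers get_mss, get_second_mss, get_mss_name, and get_second_mss_name are defined elsewhere in the project. Based on how they are applied to the struct of pivoted per-site size columns, a hypothetical sketch of two of them (the real implementations may differ):

def get_mss(row):
    "Return the largest per-site size in the pivoted row (illustrative sketch)"
    return int(max(row))

def get_mss_name(row, sites_columns):
    "Return the name of the site holding the largest size (illustrative sketch)"
    sizes = list(row)
    return sites_columns[sizes.index(max(sizes))]
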
Example #7
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    print("Input arguments: %s" % opts)

    start_time = time.time()
    verbose = opts.verbose
    yarn = opts.yarn
    inst = opts.inst
    date = opts.date
    fout = opts.fout
    aaa_hdir = opts.aaa_hdir

    if inst.lower() in ['global', 'phys01', 'phys02', 'phys03']:
        inst = inst.upper()
    else:
        raise Exception('Unsupported DBS instance "%s"' % inst)

    # Create spark context
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    # Create SQL context to be used for SQL queries
    sql_context = SQLContext(ctx)

    # Initialize DBS tables
    dbs_tables(sql_context,
               inst=inst,
               verbose=verbose,
               tables=['fdf', 'bdf', 'ddf'])

    # Initialize PhEDEx table to be used in file_block_site table
    phedex_tables(sql_context, verbose=verbose)

    # Register clean_site_name to be used with SQL queries
    sql_context.udf.register("clean_site_name", clean_site_name)

    # Register tier_from_site_name to be used with SQL queries
    sql_context.udf.register("tier_from_site_name", tier_from_site_name)

    # Register dn2uuid to be used with SQL queries
    sql_context.udf.register("dn2uuid", dn2uuid)

    # Register parse_app to be used with SQL queries
    sql_context.udf.register("parse_app", parse_app)

    # Register stream4app to be used with SQL queries
    sql_context.udf.register("stream4app", stream4app)

    # Register parse_dn to be used with SQL queries
    sql_context.udf.register("parse_dn", parse_dn)

    f_b_s_start_time = time.time()
    # Create temp table with file name, block name, site name and site from PhEDEx
    create_file_block_site_table(ctx, sql_context, verbose)
    f_b_s_elapsed_time = elapsed_time(f_b_s_start_time)

    cmssw_start_time = time.time()
    aggregated_cmssw_df = run_agg_cmssw(date, ctx, sql_context, verbose)
    cmssw_elapsed_time = elapsed_time(cmssw_start_time)

    aaa_start_time = time.time()
    if len(aaa_hdir) > 0:
        aggregated_aaa_df = run_agg_aaa(date, ctx, sql_context, aaa_hdir,
                                        verbose)
    else:
        aggregated_aaa_df = run_agg_aaa(date,
                                        ctx,
                                        sql_context,
                                        verbose=verbose)

    aaa_elapsed_time = elapsed_time(aaa_start_time)

    eos_start_time = time.time()
    aggregated_eos_df = run_agg_eos(date, ctx, sql_context, verbose)
    eos_elapsed_time = elapsed_time(eos_start_time)

    jm_start_time = time.time()
    aggregated_jm_df = run_agg_jm(date, ctx, sql_context, verbose)
    jm_elapsed_time = elapsed_time(jm_start_time)

    if verbose:
        print('Will union outputs from all streams to a single dataframe')
    # Schema for output is:
    # site name, dataset name, number of accesses, distinct users, stream
    all_df = aggregated_cmssw_df.unionAll(aggregated_aaa_df)
    all_df = all_df.unionAll(aggregated_eos_df)
    all_df = all_df.unionAll(aggregated_jm_df)
    all_df = all_df.sort(desc("nacc"))

    if verbose:
        print('Done joining all outputs to a single dataframe')

    fout = fout + "/" + short_date_string(date)

    # output_dataframe(fout + "/Aggregated/CMSSW/" + short_date_string(date), aggregated_cmssw_df, verbose)
    # output_dataframe(fout + "/Aggregated/AAA/" + short_date_string(date), aggregated_aaa_df, verbose)
    # output_dataframe(fout + "/Aggregated/EOS/" + short_date_string(date), aggregated_eos_df, verbose)
    # output_dataframe(fout + "/Aggregated/JobMonitoring/" + short_date_string(date), aggregated_jm_df, verbose)

    output_dataframe(fout, all_df, verbose)

    if verbose:
        cmssw_df_size = aggregated_cmssw_df.count()
        aaa_df_size = aggregated_aaa_df.count()
        eos_df_size = aggregated_eos_df.count()
        jm_df_size = aggregated_jm_df.count()
        all_df_size = all_df.count()

        print('CMSSW:')
        aggregated_cmssw_df.show(10)
        aggregated_cmssw_df.printSchema()

        print('AAA:')
        aggregated_aaa_df.show(10)
        aggregated_aaa_df.printSchema()

        print('EOS:')
        aggregated_eos_df.show(10)
        aggregated_eos_df.printSchema()

        print('JobMonitoring:')
        aggregated_jm_df.show(10)
        aggregated_jm_df.printSchema()

        print('Aggregated all:')
        all_df.show(10)
        all_df.printSchema()

        print('Output record count:')
        print('Output record count CMSSW         : %s' % cmssw_df_size)
        print('Output record count AAA           : %s' % aaa_df_size)
        print('Output record count EOS           : %s' % eos_df_size)
        print('Output record count JobMonitoring : %s' % jm_df_size)
        print('Output record count Total:        : %s' % all_df_size)

    ctx.stop()

    print('Start time         : %s' %
          time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(start_time)))
    print('End time           : %s' %
          time.strftime('%Y-%m-%d %H:%M:%S GMT', time.gmtime(time.time())))
    print('Total elapsed time : %s' % elapsed_time(start_time))

    print('FileBlockSite elapsed time : %s' % f_b_s_elapsed_time)
    print('AAA elapsed time           : %s' % aaa_elapsed_time)
    print('CMSSW elapsed time         : %s' % cmssw_elapsed_time)
    print('EOS elapsed time           : %s' % eos_elapsed_time)
    print('JobMonitoring elapsed time : %s' % jm_elapsed_time)
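
Example #7 registers Python helpers such as clean_site_name and parse_dn as SQL UDFs rather than DataFrame UDFs, which makes them callable from the SQL strings used inside helpers like create_file_block_site_table and the run_agg_* functions. As an illustration of the pattern only (the table and column names below are placeholders, not the project's actual schema):

# hypothetical query using a UDF registered via sql_context.udf.register
df = sql_context.sql("SELECT clean_site_name(node_name) AS site FROM some_table")
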
Example #8
def run(fout,
        date,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run a PySpark job. It requires a schema file, an HDFS
    directory with data, and an optional script with mapper/reducer functions.
    """

    # define the Spark context, the main object used to communicate with Spark
    ctx = spark_context('cms', yarn, verbose)

    quiet_logs(ctx)

    sqlContext = HiveContext(ctx)

    fromdate = '%s-%s-%s' % (date[:4], date[4:6], date[6:])
    todate = fromdate

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    tables.update(
        phedex_tables(sqlContext,
                      verbose=verbose,
                      fromdate=fromdate,
                      todate=todate))
    phedex = tables['phedex_df']

    daf = tables['daf']
    ddf = tables['ddf']
    fdf = tables['fdf']

    # DBS
    dbs_fdf_cols = ['f_dataset_id', 'f_file_size']
    dbs_ddf_cols = ['d_dataset_id', 'd_dataset', 'd_dataset_access_type_id']
    dbs_daf_cols = ['dataset_access_type_id', 'dataset_access_type']

    fdf_df = fdf.select(dbs_fdf_cols)
    ddf_df = ddf.select(dbs_ddf_cols)
    daf_df = daf.select(dbs_daf_cols)

    # dataset, dbs_size, dataset_access_type_id
    dbs_df = fdf_df.join(ddf_df, fdf_df.f_dataset_id == ddf_df.d_dataset_id)\
                   .drop('f_dataset_id')\
                   .drop('d_dataset_id')\
                   .withColumnRenamed('d_dataset', 'dataset')\
                   .withColumnRenamed('f_file_size', 'size')\
                   .withColumnRenamed('d_dataset_access_type_id', 'dataset_access_type_id')

    # dataset, size, dataset_access_type
    dbs_df = dbs_df.join(daf_df, dbs_df.dataset_access_type_id == daf_df.dataset_access_type_id)\
                   .drop(dbs_df.dataset_access_type_id)\
                   .drop(daf_df.dataset_access_type_id)

    # dataset, dbs_size
    dbs_df = dbs_df.where(dbs_df.dataset_access_type == 'VALID')\
                   .groupBy('dataset')\
                   .agg({'size':'sum'})\
                   .withColumnRenamed('sum(size)', 'dbs_size')

    # PhEDEx

    size_on_disk_udf = udf(lambda site, size: 0 if site.endswith(
        ('_MSS', '_Buffer', '_Export')) else size)

    # dataset, size, site
    phedex_cols = ['dataset_name', 'block_bytes', 'node_name']
    phedex_df = phedex.select(phedex_cols)\
                      .withColumnRenamed('dataset_name', 'dataset')\
                      .withColumnRenamed('block_bytes', 'size')\
                      .withColumnRenamed('node_name', 'site')

    # dataset, phedex_size, size_on_disk
    phedex_df = phedex_df.withColumn('size_on_disk', size_on_disk_udf(phedex_df.site, phedex_df.size))\
                         .groupBy('dataset')\
                         .agg({'size':'sum', 'size_on_disk': 'sum'})\
                         .withColumnRenamed('sum(size)', 'phedex_size')\
                         .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # dataset, dbs_size, phedex_size, size_on_disk
    result = phedex_df.join(dbs_df, phedex_df.dataset == dbs_df.dataset)\
                      .drop(dbs_df.dataset)

    extract_campaign_udf = udf(lambda dataset: dataset.split('/')[2])
    extract_tier_udf = udf(lambda dataset: dataset.split('/')[3])

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('campaign', extract_campaign_udf(result.dataset))\
                   .withColumn('tier', extract_tier_udf(result.dataset))\
                   .drop('dataset')\
                   .groupBy(['campaign', 'tier'])\
                   .agg({'dbs_size':'sum', 'phedex_size': 'sum', 'size_on_disk': 'sum'})\
                   .withColumnRenamed('sum(dbs_size)', 'dbs_size')\
                   .withColumnRenamed('sum(phedex_size)', 'phedex_size')\
                   .withColumnRenamed('sum(size_on_disk)', 'size_on_disk')

    # campaign, tier, dbs_size, phedex_size, size_on_disk
    result = result.withColumn('sum_size',
                               result.dbs_size + result.phedex_size)
    result = result.orderBy(result.sum_size, ascending=False)\
                   .drop('sum_size')\
                   .limit(LIMIT)

    # write results back to HDFS; the fout parameter defines the output area
    # on HDFS, either an absolute path or an area under /user/USERNAME
    if fout:
        result.write.format("com.databricks.spark.csv")\
                    .option("header", "true").save(fout)

    ctx.stop()
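
The size_on_disk_udf above zeroes out sizes for tape-like endpoints (_MSS, _Buffer, _Export). The same logic can be expressed without a Python UDF using when/otherwise column expressions, which keeps the computation inside the JVM; a sketch:

from pyspark.sql import functions as F

phedex_df = phedex_df.withColumn(
    'size_on_disk',
    F.when(F.col('site').rlike('_(MSS|Buffer|Export)$'), 0)
     .otherwise(F.col('size')))
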