def get_df_dbs_f_d(spark):
    """Create a dataframe for FILE-DATASET membership/ownership map

    Columns selected: f_name, dataset
    """
    csvreader = spark.read.format('csv') \
        .option('nullValue', 'null') \
        .option('mode', 'FAILFAST')
    dbs_files = csvreader.schema(cms_schemas.schema_files()) \
        .load(HDFS_DBS_FILES) \
        .withColumnRenamed('f_logical_file_name', 'f_name')
    dbs_datasets = csvreader.schema(cms_schemas.schema_datasets()) \
        .load(HDFS_DBS_DATASETS) \
        .select(['d_dataset_id', 'd_dataset'])
    df_dbs_f_d = dbs_files.join(dbs_datasets, dbs_files.f_dataset_id == dbs_datasets.d_dataset_id, how='left') \
        .withColumnRenamed('f_dataset_id', 'dataset_id') \
        .withColumnRenamed('d_dataset', 'dataset') \
        .select(['f_name', 'dataset'])
    # (df_dbs_f_d.select('f_name').distinct().count() == df_dbs_f_d.count()) => True

    return df_dbs_f_d
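
# A minimal usage sketch (illustrative only). It assumes cms_schemas,
# HDFS_DBS_FILES and HDFS_DBS_DATASETS are defined elsewhere in this module,
# as the function above expects:
#
#     from pyspark.sql import SparkSession
#
#     spark = SparkSession.builder.appName('dbs_file_dataset_map').getOrCreate()
#     df_dbs_f_d = get_df_dbs_f_d(spark)
#     df_dbs_f_d.show(5, truncate=False)   # rows of (f_name, dataset)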
Example #2
def dbs_tables(sqlContext, hdir='hdfs:///project/awg/cms', inst='GLOBAL', verbose=False):
    """
    Parse DBS records on HDFS by mapping DBS tables to Spark DataFrames
    registered with the given SQLContext.
    :returns: a dictionary of DBS Spark DataFrames.
    """
    dbsdir = hdir+'/CMS_DBS3_PROD_%s/current' % inst
    paths = {'dpath':apath(dbsdir, 'DATASETS'),
             'bpath':apath(dbsdir, 'BLOCKS'),
             'fpath':apath(dbsdir, 'FILES'),
             'apath':apath(dbsdir, 'ACQUISITION_ERAS'),
             'ppath':apath(dbsdir, 'PROCESSING_ERAS'),
             'mcpath':apath(dbsdir, 'DATASET_OUTPUT_MOD_CONFIGS'),
             'ocpath':apath(dbsdir, 'OUTPUT_MODULE_CONFIGS'),
             'rvpath':apath(dbsdir, 'RELEASE_VERSIONS'),
             'flpath':apath(dbsdir, 'FILE_LUMIS'),
             'dapath':apath(dbsdir, 'DATASET_ACCESS_TYPES')}
    print("Use the following data on HDFS")
    for val in paths.values():
        print(val)

    # define DBS tables
    daf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_dataset_access_types()) \
                        for path in files(paths['dapath'], verbose)])
    ddf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_datasets()) \
                        for path in files(paths['dpath'], verbose)])
    bdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_blocks()) \
                        for path in files(paths['bpath'], verbose)])
    fdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_files()) \
                        for path in files(paths['fpath'], verbose)])
    aef = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_acquisition_eras()) \
                        for path in files(paths['apath'], verbose)])
    pef = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_processing_eras()) \
                        for path in files(paths['ppath'], verbose)])

    mcf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_mod_configs()) \
                        for path in files(paths['mcpath'], verbose)])
    ocf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_out_configs()) \
                        for path in files(paths['ocpath'], verbose)])
    rvf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_rel_versions()) \
                        for path in files(paths['rvpath'], verbose)])
    flf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(path, schema = schema_file_lumis()) \
                        for path in files(paths['flpath'], verbose)])

    # Register temporary tables to be able to use sqlContext.sql
    daf.registerTempTable('daf')
    ddf.registerTempTable('ddf')
    bdf.registerTempTable('bdf')
    fdf.registerTempTable('fdf')
    aef.registerTempTable('aef')
    pef.registerTempTable('pef')
    mcf.registerTempTable('mcf')
    ocf.registerTempTable('ocf')
    rvf.registerTempTable('rvf')
    flf.registerTempTable('flf')

    tables = {'daf':daf, 'ddf':ddf, 'bdf':bdf, 'fdf':fdf, 'aef':aef, 'pef':pef, 'mcf':mcf, 'ocf':ocf, 'rvf':rvf, 'flf': flf}
    return tables
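
# A minimal usage sketch (illustrative only). It assumes a running SparkContext
# `sc` and that the helpers used above (apath, files, unionAll, schema_*) are
# available in this module:
#
#     from pyspark.sql import SQLContext
#
#     sqlContext = SQLContext(sc)
#     tables = dbs_tables(sqlContext, inst='GLOBAL', verbose=True)
#     # the registered temp tables can now be queried via SQL, e.g. datasets:
#     sqlContext.sql('SELECT d_dataset FROM ddf LIMIT 10').show(truncate=False)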
Example #3
def dbs_tables(sqlContext,
               hdir='hdfs:///project/awg/cms',
               inst='GLOBAL',
               verbose=False,
               tables=None):
    """
    Parse DBS records on HDFS by mapping DBS tables to Spark DataFrames
    registered with the given SQLContext. If `tables` is given, only the
    requested tables are loaded.
    :returns: a dictionary of DBS Spark DataFrames.
    """
    dbsdir = hdir + '/CMS_DBS3_PROD_%s/current' % inst
    paths = {
        'dpath': apath(dbsdir, 'DATASETS'),
        'tpath': apath(dbsdir, 'DATA_TIERS'),
        'bpath': apath(dbsdir, 'BLOCKS'),
        'fpath': apath(dbsdir, 'FILES'),
        'apath': apath(dbsdir, 'ACQUISITION_ERAS'),
        'ppath': apath(dbsdir, 'PROCESSING_ERAS'),
        'mcpath': apath(dbsdir, 'DATASET_OUTPUT_MOD_CONFIGS'),
        'ocpath': apath(dbsdir, 'OUTPUT_MODULE_CONFIGS'),
        'rvpath': apath(dbsdir, 'RELEASE_VERSIONS'),
        'flpath': apath(dbsdir, 'FILE_LUMIS'),
        'dapath': apath(dbsdir, 'DATASET_ACCESS_TYPES')
    }
    logging.info("Use the following data on HDFS")
    dbs_tables = {}  # final dict of tables we'll load
    for key, val in paths.items():
        if not tables or key in tables:
            logging.info(val)

    # define DBS tables
    if not tables or 'daf' in tables:
        daf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_dataset_access_types()) \
                            for path in files(paths['dapath'], verbose)])
        daf.registerTempTable('daf')
        dbs_tables.update({'daf': daf})
    if not tables or 'ddf' in tables:
        ddf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_datasets()) \
                            for path in files(paths['dpath'], verbose)])
        ddf.registerTempTable('ddf')
        dbs_tables.update({'ddf': ddf})
    if not tables or 'dtf' in tables:
        dtf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_data_tiers()) \
                            for path in files(paths['tpath'], verbose)])
        dtf.registerTempTable('dtf')
        dbs_tables.update({'dtf': dtf})
    if not tables or 'bdf' in tables:
        bdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_blocks()) \
                            for path in files(paths['bpath'], verbose)])
        bdf.registerTempTable('bdf')
        dbs_tables.update({'bdf': bdf})
    if not tables or 'fdf' in tables:
        fdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_files()) \
                            for path in files(paths['fpath'], verbose)])
        fdf.registerTempTable('fdf')
        dbs_tables.update({'fdf': fdf})
    if not tables or 'aef' in tables:
        aef = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_acquisition_eras()) \
                            for path in files(paths['apath'], verbose)])
        aef.registerTempTable('aef')
        dbs_tables.update({'aef': aef})
    if not tables or 'pef' in tables:
        pef = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_processing_eras()) \
                            for path in files(paths['ppath'], verbose)])
        pef.registerTempTable('pef')
        dbs_tables.update({'pef': pef})
    if not tables or 'mcf' in tables:
        mcf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_mod_configs()) \
                            for path in files(paths['mcpath'], verbose)])
        mcf.registerTempTable('mcf')
        dbs_tables.update({'mcf': mcf})
    if not tables or 'ocf' in tables:
        ocf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_out_configs()) \
                            for path in files(paths['ocpath'], verbose)])
        ocf.registerTempTable('ocf')
        dbs_tables.update({'ocf': ocf})
    if not tables or 'rvf' in tables:
        rvf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_rel_versions()) \
                            for path in files(paths['rvpath'], verbose)])
        rvf.registerTempTable('rvf')
        dbs_tables.update({'rvf': rvf})
    if not tables or 'flf' in tables:
        flf = unionAll([sqlContext.read.format('com.databricks.spark.csv')\
                            .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                            .load(path, schema = schema_file_lumis()) \
                            for path in files(paths['flpath'], verbose)])
        flf.registerTempTable('flf')
        dbs_tables.update({'flf': flf})

    return dbs_tables
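
# A minimal usage sketch (illustrative only): load only the tables that are
# needed, e.g. files and datasets, instead of the full DBS set. Column names
# follow the schemas used in the examples above (schema_files, schema_datasets):
#
#     tables = dbs_tables(sqlContext, inst='GLOBAL', tables=['fdf', 'ddf'])
#     fdf, ddf = tables['fdf'], tables['ddf']
#     fdf.join(ddf, fdf.f_dataset_id == ddf.d_dataset_id, how='left') \
#        .select('f_logical_file_name', 'd_dataset') \
#        .show(5, truncate=False)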