def get_df_dbs_f_d(spark):
    """Create a dataframe for the FILE-DATASET membership/ownership map

    Columns selected: f_name, dataset
    """
    csvreader = spark.read.format('csv') \
        .option('nullValue', 'null') \
        .option('mode', 'FAILFAST')
    dbs_files = csvreader.schema(cms_schemas.schema_files()) \
        .load(HDFS_DBS_FILES) \
        .withColumnRenamed('f_logical_file_name', 'f_name')
    dbs_datasets = csvreader.schema(cms_schemas.schema_datasets()) \
        .load(HDFS_DBS_DATASETS) \
        .select(['d_dataset_id', 'd_dataset'])
    df_dbs_f_d = dbs_files.join(dbs_datasets,
                                dbs_files.f_dataset_id == dbs_datasets.d_dataset_id,
                                how='left') \
        .withColumnRenamed('f_dataset_id', 'dataset_id') \
        .withColumnRenamed('d_dataset', 'dataset') \
        .select(['f_name', 'dataset'])
    # (df_dbs_f_d.select('f_name').distinct().count() == df_dbs_f_d.count()) => True
    return df_dbs_f_d
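
# Illustrative usage sketch (not part of the original module): how the
# FILE->dataset map above might be built and inspected. It assumes a standard
# SparkSession and that HDFS_DBS_FILES, HDFS_DBS_DATASETS and cms_schemas are
# defined as in this module; the application name is a placeholder.
def _example_file_dataset_map():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName('dbs_file_dataset_map').getOrCreate()
    df = get_df_dbs_f_d(spark)
    # one row per logical file name; 'dataset' can be null for files whose
    # dataset id found no match (left join above)
    df.show(5, truncate=False)
    return df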
def dbs_tables(sqlContext, hdir='hdfs:///project/awg/cms', inst='GLOBAL', verbose=False):
    """
    Parse DBS records on HDFS via mapping DBS tables to Spark SQLContext.

    :returns: a dictionary with DBS Spark DataFrames.
    """
    dbsdir = hdir + '/CMS_DBS3_PROD_%s/current' % inst
    paths = {'dpath': apath(dbsdir, 'DATASETS'),
             'bpath': apath(dbsdir, 'BLOCKS'),
             'fpath': apath(dbsdir, 'FILES'),
             'apath': apath(dbsdir, 'ACQUISITION_ERAS'),
             'ppath': apath(dbsdir, 'PROCESSING_ERAS'),
             'mcpath': apath(dbsdir, 'DATASET_OUTPUT_MOD_CONFIGS'),
             'ocpath': apath(dbsdir, 'OUTPUT_MODULE_CONFIGS'),
             'rvpath': apath(dbsdir, 'RELEASE_VERSIONS'),
             'flpath': apath(dbsdir, 'FILE_LUMIS'),
             'dapath': apath(dbsdir, 'DATASET_ACCESS_TYPES')}
    print("Use the following data on HDFS")
    for val in paths.values():
        print(val)

    # define DBS tables
    daf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_dataset_access_types())
                    for path in files(paths['dapath'], verbose)])
    ddf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_datasets())
                    for path in files(paths['dpath'], verbose)])
    bdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_blocks())
                    for path in files(paths['bpath'], verbose)])
    fdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_files())
                    for path in files(paths['fpath'], verbose)])
    aef = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_acquisition_eras())
                    for path in files(paths['apath'], verbose)])
    pef = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_processing_eras())
                    for path in files(paths['ppath'], verbose)])
    mcf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_mod_configs())
                    for path in files(paths['mcpath'], verbose)])
    ocf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_out_configs())
                    for path in files(paths['ocpath'], verbose)])
    rvf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_rel_versions())
                    for path in files(paths['rvpath'], verbose)])
    flf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                    .options(treatEmptyValuesAsNulls='true', nullValue='null')
                    .load(path, schema=schema_file_lumis())
                    for path in files(paths['flpath'], verbose)])

    # register temporary tables so they can be queried via sqlContext.sql
    daf.registerTempTable('daf')
    ddf.registerTempTable('ddf')
    bdf.registerTempTable('bdf')
    fdf.registerTempTable('fdf')
    aef.registerTempTable('aef')
    pef.registerTempTable('pef')
    mcf.registerTempTable('mcf')
    ocf.registerTempTable('ocf')
    rvf.registerTempTable('rvf')
    flf.registerTempTable('flf')

    tables = {'daf': daf, 'ddf': ddf, 'bdf': bdf, 'fdf': fdf, 'aef': aef,
              'pef': pef, 'mcf': mcf, 'ocf': ocf, 'rvf': rvf, 'flf': flf}
    return tables
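
# Illustrative usage sketch (not part of the original module): load all DBS
# tables with the function above and query the registered temp tables through
# plain SQL. It assumes an SQLContext created from an existing SparkContext;
# the column name d_dataset follows the DATASETS schema used in this module.
def _example_dbs_tables_query(sqlContext):
    tables = dbs_tables(sqlContext, verbose=True)
    ddf = tables['ddf']  # DATASETS DataFrame
    # temp tables 'ddf', 'fdf', ... are registered, so sqlContext.sql works
    ndatasets = sqlContext.sql('SELECT COUNT(DISTINCT d_dataset) AS n FROM ddf')
    ndatasets.show()
    return ddf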
def dbs_tables(sqlContext, hdir='hdfs:///project/awg/cms', inst='GLOBAL', verbose=False, tables=None):
    """
    Parse DBS records on HDFS via mapping DBS tables to Spark SQLContext.

    :returns: a dictionary with DBS Spark DataFrames.
    """
    dbsdir = hdir + '/CMS_DBS3_PROD_%s/current' % inst
    paths = {
        'dpath': apath(dbsdir, 'DATASETS'),
        'tpath': apath(dbsdir, 'DATA_TIERS'),
        'bpath': apath(dbsdir, 'BLOCKS'),
        'fpath': apath(dbsdir, 'FILES'),
        'apath': apath(dbsdir, 'ACQUISITION_ERAS'),
        'ppath': apath(dbsdir, 'PROCESSING_ERAS'),
        'mcpath': apath(dbsdir, 'DATASET_OUTPUT_MOD_CONFIGS'),
        'ocpath': apath(dbsdir, 'OUTPUT_MODULE_CONFIGS'),
        'rvpath': apath(dbsdir, 'RELEASE_VERSIONS'),
        'flpath': apath(dbsdir, 'FILE_LUMIS'),
        'dapath': apath(dbsdir, 'DATASET_ACCESS_TYPES')
    }
    logging.info("Use the following data on HDFS")
    dbs_tables = {}  # final dict of tables we'll load
    for key, val in paths.items():
        if tables:
            if key in tables:
                logging.info(val)
        else:
            logging.info(val)

    # define DBS tables
    if not tables or 'daf' in tables:
        daf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_dataset_access_types())
                        for path in files(paths['dapath'], verbose)])
        daf.registerTempTable('daf')
        dbs_tables.update({'daf': daf})
    if not tables or 'ddf' in tables:
        ddf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_datasets())
                        for path in files(paths['dpath'], verbose)])
        ddf.registerTempTable('ddf')
        dbs_tables.update({'ddf': ddf})
    if not tables or 'dtf' in tables:
        dtf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_data_tiers())
                        for path in files(paths['tpath'], verbose)])
        dtf.registerTempTable('dtf')
        dbs_tables.update({'dtf': dtf})
    if not tables or 'bdf' in tables:
        bdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_blocks())
                        for path in files(paths['bpath'], verbose)])
        bdf.registerTempTable('bdf')
        dbs_tables.update({'bdf': bdf})
    if not tables or 'fdf' in tables:
        fdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_files())
                        for path in files(paths['fpath'], verbose)])
        fdf.registerTempTable('fdf')
        dbs_tables.update({'fdf': fdf})
    if not tables or 'aef' in tables:
        aef = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_acquisition_eras())
                        for path in files(paths['apath'], verbose)])
        aef.registerTempTable('aef')
        dbs_tables.update({'aef': aef})
    if not tables or 'pef' in tables:
        pef = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_processing_eras())
                        for path in files(paths['ppath'], verbose)])
        pef.registerTempTable('pef')
        dbs_tables.update({'pef': pef})
    if not tables or 'mcf' in tables:
        mcf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_mod_configs())
                        for path in files(paths['mcpath'], verbose)])
        mcf.registerTempTable('mcf')
        dbs_tables.update({'mcf': mcf})
    if not tables or 'ocf' in tables:
        ocf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_out_configs())
                        for path in files(paths['ocpath'], verbose)])
        ocf.registerTempTable('ocf')
        dbs_tables.update({'ocf': ocf})
    if not tables or 'rvf' in tables:
        rvf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_rel_versions())
                        for path in files(paths['rvpath'], verbose)])
        rvf.registerTempTable('rvf')
        dbs_tables.update({'rvf': rvf})
    if not tables or 'flf' in tables:
        flf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(path, schema=schema_file_lumis())
                        for path in files(paths['flpath'], verbose)])
        flf.registerTempTable('flf')
        dbs_tables.update({'flf': flf})
    return dbs_tables
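
# Illustrative usage sketch (not part of the original module): the `tables`
# argument of the variant above lets callers load only the DataFrames they
# need, keyed by the short names used internally ('daf', 'ddf', 'fdf', ...).
# The join below uses the f_dataset_id / d_dataset_id columns from the FILES
# and DATASETS schemas referenced elsewhere in this module.
def _example_selective_dbs_tables(sqlContext):
    # load only the DATASETS and FILES tables; the other HDFS paths are skipped
    tabs = dbs_tables(sqlContext, inst='GLOBAL', tables=['ddf', 'fdf'])
    return tabs['fdf'].join(tabs['ddf'],
                            tabs['fdf'].f_dataset_id == tabs['ddf'].d_dataset_id,
                            how='inner')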