def test_esk608(self):
    """Test Esk-608: Execute Spark histogram filling macro"""
    # check if required Python and Java libraries are made available to worker nodes
    sc = ProcessManager().service(SparkManager).get_session().sparkContext
    self.assertRegexpMatches(
        sc.getConf().get('spark.master', ''),
        r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files'
    )
    self.assertRegexpMatches(
        sc.getConf().get('spark.jars.packages', ''),
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages,'
        ' test_esk608 will fail'
    )
    if re.search('spark://', sc.getConf().get('spark.master', '')):
        py_mods = utils.get_file_path('py_mods')
        self.assertRegexpMatches(
            sc.getConf().get('spark.submit.pyFiles', ''),
            py_mods,
            'Eskapade modules missing from spark.submit.pyFiles, needed in Spark cluster mode'
        )
        self.assertRegexpMatches(
            sc.getConf().get('spark.files', ''),
            py_mods,
            'Eskapade modules missing from spark.files, needed in Spark cluster mode'
        )

    # run Eskapade
    self.run_eskapade('esk608_spark_histogrammar.py')
    proc_mgr = ProcessManager()
    ds = proc_mgr.service(DataStore)
    settings = proc_mgr.service(ConfigObject)

    # check data frame
    self.assertIn('spark_df', ds, 'no object with key "spark_df" in data store')
    self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame, '"spark_df" is not a Spark data frame')
    self.assertEqual(ds['spark_df'].count(), 12, 'unexpected number of rows in data frame')
    self.assertListEqual(sorted(ds['spark_df'].columns), sorted(['date', 'loc', 'x', 'y']),
                         'unexpected columns in data frame')

    # data-generation checks
    self.assertIn('hist', ds)
    self.assertIsInstance(ds['hist'], dict)
    col_names = ['date', 'x', 'y', 'loc', 'x:y']
    self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

    # data-summary checks
    f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
    file_names = ['report.tex'] + ['hist_{}.pdf'.format(col) for col in f_bases]
    for fname in file_names:
        path = persistence.io_path('results_data', settings.io_conf(), 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
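
# A minimal standalone check of the 'spark.master' pattern used above, runnable
# without Spark or Eskapade; the example master URLs are assumptions for illustration.
# The original pattern 'local\[[.*]\]' used a character class and would have rejected
# masters such as 'local[4]'; the corrected pattern matches any 'local[...]' master.
import re

LOCAL_MASTER_RE = re.compile(r'local\[.*\]')
for master in ('local[*]', 'local[4]', 'spark://host:7077'):
    # only the two local-mode masters should match
    print(master, '->', bool(LOCAL_MASTER_RE.search(master)))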
def initialize(self):
    """Initialize SparkDataToCsv"""
    # check input arguments
    self.check_arg_types(allow_none=True, read_key=str, output_path=str, compression_codec=str)
    self.check_arg_types(mode=str, sep=str, num_files=int)
    self.check_arg_types(recurse=True, allow_none=True)
    self.check_arg_vals('read_key', 'sep')
    self.check_arg_vals('output_path', 'compression_codec', allow_none=True)
    self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))
    if self.num_files < 1:
        raise RuntimeError('requested number of files is less than 1 ({:d})'.format(self.num_files))

    # set other attributes
    self.do_execution = True

    # set default output path
    if not self.output_path:
        settings = ProcessManager().service(ConfigObject)
        self.output_path = 'file:' + persistence.io_path('results_data', settings.io_conf(),
                                                         '{}_output'.format(self.name))

    # parse header argument
    try:
        self.header = tuple(self.header)
    except TypeError:
        self.header = bool(self.header)
    if isinstance(self.header, tuple) and not self.header:
        raise RuntimeError('empty header sequence specified')

    # check output directory, if local
    if self.output_path.startswith('file:'):
        local_output_path = os.path.abspath(self.output_path.replace('file:', ''))
        if os.path.exists(local_output_path):
            # output data already exist
            if self.mode == 'ignore':
                # do not execute link
                self.log().debug('Output data already exist; not executing link')
                self.do_execution = False
                return StatusCode.Success
            elif self.mode == 'error':
                # raise exception
                raise RuntimeError('output data already exist')

            # remove output directory
            if not os.path.isdir(local_output_path):
                raise RuntimeError('output path "{}" is not a directory'.format(local_output_path))
            shutil.rmtree(local_output_path)
        elif not os.path.exists(os.path.dirname(local_output_path)):
            # create path up to the last component
            self.log().debug('Creating output path "%s"', local_output_path)
            os.makedirs(os.path.dirname(local_output_path))

    return StatusCode.Success
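
# A hedged usage sketch (not from the source): wiring SparkDataToCsv into a macro
# chain with the arguments validated in initialize() above. The kwargs mirror the
# attributes the link checks (read_key, output_path, mode, sep, header, num_files);
# passing them as constructor keyword arguments assumes the usual Eskapade link
# pattern, and the chain/link names and values are illustrative only.
from eskapade import ProcessManager, spark_analysis

proc_mgr = ProcessManager()
ch = proc_mgr.add_chain('Output')
ch.add_link(spark_analysis.SparkDataToCsv(
    name='write_csv',
    read_key='spark_df',  # DataStore key of the Spark data frame to write
    output_path=None,     # None -> default '<results_data>/<name>_output' set in initialize()
    mode='overwrite',     # one of ('overwrite', 'ignore', 'error')
    sep=',',
    header=True,          # bool, or a sequence of column names
    num_files=1))         # must be >= 1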
def test_esk305(self):
    """Test Esk-305: Correlation summary"""
    settings = ProcessManager().service(ConfigObject)
    settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
    settings['macro'] = settings['esRoot'] + '/tutorials/esk305_correlation_summary.py'
    settings['batchMode'] = True

    status = execution.run_eskapade(settings)
    self.assertTrue(status.isSuccess())

    pm = ProcessManager()
    settings = pm.service(ConfigObject)
    ds = pm.service(DataStore)

    # input data checks
    all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']
    self.assertIn('input_data', ds)
    self.assertIsInstance(ds['input_data'], pd.DataFrame)
    self.assertListEqual(list(ds['input_data'].columns), all_col_names)

    self.assertIn('correlations', ds)
    self.assertIsInstance(ds['correlations'], list)
    corr_list = ds['correlations']
    self.assertEqual(4, len(corr_list))

    # correlation matrix checks
    col_names = ['x1', 'x2', 'x3', 'x4', 'x5']
    for corr in corr_list:
        self.assertIsInstance(corr, pd.DataFrame)
        # column check disabled: not all four correlation matrices share the same columns
        # self.assertListEqual(list(corr.columns), col_names)
        self.assertListEqual(list(corr.index), col_names)

    # heatmap pdf checks
    io_conf = settings.io_conf()
    results_path = persistence.io_path('results_data', io_conf, 'report')
    correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio']
    for corr in correlations:
        path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(results_path, corr)
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertTrue(statinfo.st_size > 0)
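
# A standalone sketch of the first three correlation matrices checked above: pandas
# computes 'pearson', 'kendall' and 'spearman' directly with DataFrame.corr(); the
# correlation ratio is Eskapade-specific and omitted here. The random input data is
# an assumption for illustration.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100, 5), columns=['x1', 'x2', 'x3', 'x4', 'x5'])
for method in ('pearson', 'kendall', 'spearman'):
    corr = df.corr(method=method)
    # each matrix is square, indexed and labelled by the input columns
    assert list(corr.index) == list(corr.columns) == ['x1', 'x2', 'x3', 'x4', 'x5']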
log.debug('Now parsing configuration file esk209_read_big_data_itr')

#########################################################################################
# --- minimal analysis information

settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk209_read_big_data_itr'
settings['version'] = 0

#########################################################################################
# --- when chunking through an input file, pick up only N lines in each iteration

chunksize = 5

#########################################################################################
# --- set path of data

data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example 1: readdata loops over the input files, but no file chunking
if settings.get('do_example1', True):
    ch = proc_mgr.add_chain('MyChain1')

    # --- a loop is set up in the chain MyChain:
    #     we iterate over (chunks of) the next file in the list until the iterator is done,
    #     then move on to the next chain (Overview)
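
# A standalone sketch (not part of the macro) of the chunked iteration described
# above, using plain pandas instead of the Eskapade links: read_csv with a chunksize
# yields the file in successive data frames of at most 'chunksize' rows. It assumes
# the dummy.csv file at data_path exists and is readable.
import pandas as pd

for i, chunk in enumerate(pd.read_csv(data_path, chunksize=chunksize)):
    print('chunk {0:d}: {1:d} rows'.format(i, len(chunk)))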
#########################################################################################
# --- minimal analysis information

settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk304_df_boxplot'
settings['version'] = 0

#########################################################################################
# --- analysis values, settings, helper functions, configuration flags

msg = r"""
The plots and latex files produced by link df_summary can be found in dir:
%s
""" % persistence.io_path('results_data', settings.io_conf(), 'report')
log.info(msg)

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str),
                var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'], dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create process manager
proc_mgr = ProcessManager()
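
# A standalone sketch (an assumption, not the Eskapade generator link itself) of how
# data matching GEN_CONF could be produced: categorical columns are drawn from the
# 'choice' lists and the numerical column from a normal distribution with the given
# mean and standard deviation.
import numpy as np
import pandas as pd

rows = {col: (np.random.choice(conf['choice'], size=SIZE) if 'choice' in conf
              else np.random.normal(conf['mean'], conf['std'], size=SIZE))
        for col, conf in GEN_CONF.items()}
df = pd.DataFrame(rows, columns=COLUMNS)
print(df.describe(include='all'))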