def initialize(self):
    """Initialize the link."""
    # perform basic checks of configured attributes
    # a key and path OR dictionary need to have been set.
    if self.path and self.key:
        self.path_map = {self.key: self.path}
    elif not self.path_map:
        raise Exception('Path and key OR dictionary not properly set.')

    # correct the output paths, if need be
    paths = list(self.path_map.values())
    assert '' not in paths, 'One or more of the paths in dict is empty.'
    assert all(isinstance(p, str) for p in paths), 'One or more of the paths in dict is not string.'

    # update paths if needed
    for k, p in self.path_map.items():
        if '/' not in p:
            self.path_map[k] = persistence.io_path('results_data', p)
            self.logger.debug('Output filename for key <{key}> has been reset to {new_path}.',
                              key=k, new_path=self.path_map[k])

    self.logger.info('kwargs passed on to pandas writer are: {kwargs}.', kwargs=self.kwargs)

    return StatusCode.Success
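# A minimal, standalone sketch of the path correction performed in initialize() above:
# bare filenames (no '/') are redirected into the results_data area, full paths are kept.
# The helper name and the results_dir value are illustrative only, not part of the link.
import os

def _resolve_output_paths(path_map, results_dir='/path/to/results_data'):
    resolved = {}
    for key, p in path_map.items():
        resolved[key] = p if '/' in p else os.path.join(results_dir, p)
    return resolved

print(_resolve_output_paths({'df1': 'output.csv', 'df2': '/tmp/other.csv'}))
# {'df1': '/path/to/results_data/output.csv', 'df2': '/tmp/other.csv'}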
def test_esk409(self):
    """Test Esk-409: Unredeemed vouchers."""
    # run Eskapade
    macro = resources.tutorial('esk409_unredeemed_vouchers.py')
    self.eskapade_run(macro)
    ds = process_manager.service(DataStore)

    # check generated data
    self.assertIn('voucher_redeems', ds)
    self.assertIn('voucher_ages', ds)
    self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
    self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
    self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
    self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
    self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

    # check fit result
    fit_link = process_manager.get('Fitting').get('Fit')
    self.assertEqual(fit_link.fit_result.status(), 0)
    n_ev_pull = (fit_link.results['n_ev'][0] - 6000.) / fit_link.results['n_ev'][1]
    self.assertGreater(n_ev_pull, -3.)
    self.assertLess(n_ev_pull, 3.)

    # check plot output
    plot_path = persistence.io_path('results_data', 'voucher_redeem.pdf')
    self.assertTrue(os.path.exists(plot_path))
    statinfo = os.stat(plot_path)
    self.assertGreater(statinfo.st_size, 0)
def test_esk411(self):
    """Test Esk-411: Predictive maintenance Weibull fit."""
    # run Eskapade
    macro = resources.tutorial('esk411_weibull_predictive_maintenance.py')
    self.eskapade_run(macro)
    ds = process_manager.service(DataStore)
    ws = process_manager.service(RooFitManager).ws

    # roofit objects check in datastore
    self.assertIn('fit_result', ds)
    self.assertIsInstance(ds['fit_result'], ROOT.RooFitResult)

    # roofit objects check in workspace
    self.assertIn('binnedData', ds)
    self.assertIsInstance(ds['binnedData'], ROOT.RooDataHist)
    mdata = ds['binnedData']
    self.assertTrue(mdata)
    self.assertEqual(300, mdata.numEntries())
    mpdf = ws.pdf('sum3pdf')
    self.assertTrue(mpdf)

    # successful fit result
    fit_result = ds['fit_result']
    self.assertEqual(0, fit_result.status())
    self.assertEqual(3, fit_result.covQual())

    n1 = ws.var('N1')
    self.assertTrue(n1)
    self.assertGreater(n1.getVal(), 2.e5)
    n2 = ws.var('N2')
    self.assertTrue(n2)
    self.assertGreater(n2.getVal(), 4.e5)
    n3 = ws.var('N3')
    self.assertTrue(n3)
    self.assertGreater(n3.getVal(), 5.e4)

    # data-summary checks
    file_names = ['weibull_fit_report.tex',
                  'correlation_matrix_fit_result.pdf',
                  'floating_pars_fit_result.tex',
                  'fit_of_time_difference_medium_range.pdf']
    for fname in file_names:
        path = persistence.io_path('results_data', 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertGreater(statinfo.st_size, 0)
def test_esk608(self):
    """Test Esk-608: Execute Spark histogram filling macro."""
    # check if required Python and Java libraries are made available to worker nodes
    sc = process_manager.service(SparkManager).get_session().sparkContext
    self.assertRegex(
        sc.getConf().get('spark.master', ''),
        r'local\[.*\]',
        'Spark not running in local mode, required for testing with local files')
    self.assertRegex(
        sc.getConf().get('spark.jars.packages', ''),
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
        'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, '
        'test_esk608 will fail')

    # run Eskapade
    self.eskapade_run(resources.tutorial('esk608_spark_histogrammar.py'))
    ds = process_manager.service(DataStore)

    # check data frame
    self.assertIn('spark_df', ds, 'no object with key "spark_df" in data store')
    self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                          '"spark_df" is not a Spark data frame')
    self.assertEqual(ds['spark_df'].count(), 12, 'unexpected number of rows in data frame')
    self.assertListEqual(sorted(ds['spark_df'].columns),
                         sorted(['date', 'loc', 'x', 'y']),
                         'unexpected columns in data frame')

    # data-generation checks
    self.assertIn('hist', ds)
    self.assertIsInstance(ds['hist'], dict)
    col_names = ['date', 'x', 'y', 'loc', 'x:y']
    self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

    # data-summary checks
    f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
    file_names = ['report.tex'] + ['hist_{}.pdf'.format(col) for col in f_bases]
    for fname in file_names:
        path = persistence.io_path('results_data', 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertGreater(statinfo.st_size, 0)
def test_esk401(self):
    """Test Esk-401: ROOT hist fill, plot, convert."""
    # run Eskapade
    self.eskapade_run(resources.tutorial('esk401_roothist_fill_plot_convert.py'))
    ds = process_manager.service(DataStore)

    # histogram checks
    self.assertIn('hist', ds)
    self.assertIsInstance(ds['hist'], dict)
    columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x1:x2', 'x2:x3', 'x4:x5']
    self.assertListEqual(sorted(ds['hist'].keys()), sorted(columns))
    for col in columns:
        self.assertIsInstance(ds['hist'][col], ROOT.TH1)

    # data-generation checks
    self.assertIn('n_correlated_data', ds)
    self.assertEqual(500, ds['n_correlated_data'])
    self.assertIn('n_rdh_x1', ds)
    self.assertEqual(40, ds['n_rdh_x1'])
    self.assertIn('n_rds_x2_vs_x3', ds)
    self.assertEqual(23, ds['n_rds_x2_vs_x3'])

    # roofit objects check
    self.assertIn('hpdf', ds)
    self.assertIsInstance(ds['hpdf'], ROOT.RooHistPdf)
    self.assertIn('rdh_x1', ds)
    self.assertIsInstance(ds['rdh_x1'], ROOT.RooDataHist)
    self.assertIn('rds_x2_vs_x3', ds)
    self.assertIsInstance(ds['rds_x2_vs_x3'], ROOT.RooDataSet)
    self.assertIn('vars_x2_vs_x3', ds)
    self.assertIsInstance(ds['vars_x2_vs_x3'], ROOT.RooArgSet)

    # data-summary checks
    file_names = ['report.tex'] + ['hist_{}.pdf'.format(col.replace(':', '_vs_')) for col in columns]
    for fname in file_names:
        path = persistence.io_path('results_data', 'report/{}'.format(fname))
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertGreater(statinfo.st_size, 0)
def test_esk305(self):
    """Test Esk-305: Correlation summary."""
    settings = process_manager.service(ConfigObject)
    settings['batchMode'] = True

    self.eskapade_run(resources.tutorial('esk305_correlation_summary.py'))
    ds = process_manager.service(DataStore)

    # input data checks
    all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']
    self.assertIn('input_data', ds)
    self.assertIsInstance(ds['input_data'], pd.DataFrame)
    self.assertListEqual(list(ds['input_data'].columns), all_col_names)

    self.assertIn('correlations', ds)
    self.assertIsInstance(ds['correlations'], list)
    corr_list = ds['correlations']
    self.assertEqual(5, len(corr_list))

    # correlation matrix checks
    col_names = ['x1', 'x2', 'x3', 'x4', 'x5']
    for corr in corr_list:
        self.assertIsInstance(corr, pd.DataFrame)
        # self.assertListEqual(list(corr.columns), col_names)
        self.assertListEqual(list(corr.index), col_names)

    # heatmap pdf checks
    results_path = persistence.io_path('results_data', 'report')
    correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio', 'phik']
    for corr in correlations:
        path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(results_path, corr)
        self.assertTrue(os.path.exists(path))
        statinfo = os.stat(path)
        self.assertGreater(statinfo.st_size, 0)
def _process_results_path(self):
    """Process results_path argument."""
    if not self.results_path:
        self.results_path = persistence.io_path('results_data', 'report')
    persistence.create_dir(self.results_path)
##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = SparkStreamingWordCount(name='SparkStreamingWordCount',
                                         read_key='dstream',
                                         store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' + persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1)
spark_streaming.add(writer_link)

# start/stop of Spark Streaming
control_link = SparkStreamingController(name='SparkStreamingController', timeout=10)
spark_streaming.add(control_link)

##########################################################################

logger.debug('Done parsing configuration file esk610_spark_streaming.')
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk304_df_boxplot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""
The plots and latex files produced by link df_summary can be found in dir:
{path}
"""
logger.info(msg, path=persistence.io_path('results_data', 'report'))

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str),
                var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'], dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')
from escore import process_manager, ConfigObject, DataStore, Chain, core_ops
from escore.logger import Logger, LogLevel
from escore.core import persistence

logger = Logger()
logger.debug('Now parsing configuration file esk111_load_datastore_from_file.')

# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk111_load_datastore_from_file'
settings['version'] = 0

ds = process_manager.service(DataStore)
ds['number'] = 1
file_path = persistence.io_path('proc_service_data', 'temp_datastore.pkl')
ds.persist_in_file(file_path)

# --- update the number
ds['number'] = 2

# --- Reload from the pickle file with:
# >>> ds = DataStore.import_from_file(file_path)

# --- now set up the chains and links

ch = Chain('Start')
link = core_ops.ImportDataStore(name='importds', path=file_path)
link.logger.log_level = LogLevel.DEBUG
ch.add(link)
# hand, e.g. csv, xlsx. To use the numpy reader one of the
# following should be true:
# * reader is {'numpy', 'np', 'npy', 'npz'}
# * path contains extensions {'npy', 'npz'}
# * param `file_type` is {'npy', 'npz'}
# restore_index : bool
#     whether to restore the index from the metadata. Default is
#     False when the index is numeric, True otherwise.
# file_type : str | {'npy', 'npz'}
#     when using the numpy reader. Optional, see reader for details.

nw.add(
    ReadToDf(
        name='numpy_reader',
        key='reloaded_typed_data_np',
        path=persistence.io_path('results_data', 'tmp_tut_esk210.npy'),
    ))

# The dataframe has now been restored with the dtypes of the original df
# and its index has been restored as it is non-numeric

#########################################################################################
# --- Feather reader - writer (R/W)
# The primary benefits of the Feather W/R are:
# * Interoperability with R dataframes; this package was written
#   to make sharing data between R and Python much easier
# * High performance; relies on the Apache Arrow framework
# * Data type restoration
# * Non-numerical index restoration
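# A minimal, self-contained sketch (plain pandas + pyarrow, not the Eskapade reader/writer
# links themselves) of the Feather round trip described above. Feather itself does not
# serialize a custom index, so here the index is moved to a column before writing and put
# back after reading; column dtypes survive the round trip. The file name is illustrative.
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']},
                  index=pd.Index(['r1', 'r2', 'r3'], name='record'))

df.reset_index().to_feather('tmp_example.ftr')
restored = pd.read_feather('tmp_example.ftr').set_index('record')

assert restored.equals(df)  # values, dtypes and the restored index all match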
def initialize(self):
    """Initialize the link."""
    # check input arguments
    self.check_arg_types(allow_none=True, read_key=str, output_path=str, compression_codec=str)
    self.check_arg_types(mode=str, sep=str, num_files=int)
    self.check_arg_types(recurse=True, allow_none=True)
    self.check_arg_vals('read_key', 'sep')
    self.check_arg_vals('output_path', 'compression_codec', allow_none=True)
    self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))

    if self.num_files < 1:
        raise RuntimeError('Requested number of files is less than 1 ({:d}).'.format(self.num_files))

    # set other attributes
    self.do_execution = True

    # set default output path
    if not self.output_path:
        self.output_path = persistence.io_path('results_data', '{}_output'.format(self.name))

    # parse header argument
    try:
        self.header = tuple(self.header)
    except TypeError:
        self.header = bool(self.header)
    if isinstance(self.header, tuple) and not self.header:
        raise RuntimeError('Empty header sequence specified.')

    # check output directory
    if self.output_path.startswith('file:/'):
        output_path = os.path.abspath(self.output_path.replace('file:/', '/'))
        if os.path.exists(output_path):
            # output data already exist
            if self.mode == 'ignore':
                # do not execute link
                self.logger.debug('Output data already exist; not executing link.')
                self.do_execution = False
                return StatusCode.Success
            elif self.mode == 'error':
                # raise exception
                raise RuntimeError('Output data already exist.')

            # remove output directory
            if not os.path.isdir(output_path):
                raise RuntimeError('Output path "{}" is not a directory.'.format(output_path))
            shutil.rmtree(output_path)
        elif not os.path.exists(os.path.dirname(output_path)):
            # create path up to the last component
            self.logger.debug('Creating output path "{path}".', path=output_path)
            os.makedirs(os.path.dirname(output_path))

    return StatusCode.Success
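# Standalone illustration (not the link itself) of how the header argument above is
# normalised by the try/except: an iterable of column names becomes a tuple, while a
# non-iterable value (True, False, None) is coerced to a bool.
def _parse_header(header):
    try:
        return tuple(header)
    except TypeError:
        return bool(header)

print(_parse_header(['date', 'loc', 'x', 'y']))  # ('date', 'loc', 'x', 'y')
print(_parse_header(True))                       # True
print(_parse_header(None))                       # False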