Example #1
    def initialize(self):
        """Initialize the link."""
        # basic checks on configured attributes:
        # either a key and path, or a path_map dictionary, must have been set.
        if self.path and self.key:
            self.path_map = {self.key: self.path}
        elif not self.path_map:
            raise Exception('Either path and key, or path_map, must be set.')

        # correct the output paths, if need be
        paths = list(self.path_map.values())
        assert '' not in paths, 'One or more of the paths in dict is empty.'
        assert all(isinstance(p, str) for p in paths), \
            'One or more of the paths in dict is not a string.'
        # update paths if needed
        for key, path in self.path_map.items():
            if '/' not in path:
                self.path_map[key] = persistence.io_path('results_data', path)
                self.logger.debug(
                    'Output path for key <{key}> has been reset to {path}.',
                    key=key,
                    path=self.path_map[key])

        self.logger.info('kwargs passed on to pandas writer are: {kwargs}.',
                         kwargs=self.kwargs)

        return StatusCode.Success
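A minimal usage sketch of the path-correction step above (hypothetical link and
attribute names; it assumes only what the code shows, namely that bare file
names without a '/' are rerouted through persistence.io_path):

    # hypothetical: a writer link configured with a single key and bare filename
    link = SomeWriterLink()  # hypothetical class name
    link.key, link.path = 'df1', 'output.csv'
    link.initialize()
    # 'output.csv' contains no '/', so after initialize():
    # link.path_map['df1'] == persistence.io_path('results_data', 'output.csv')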
Example #2
    def test_esk409(self):
        """Test Esk-409: Unredeemed vouchers."""
        # run Eskapade
        macro = resources.tutorial('esk409_unredeemed_vouchers.py')
        self.eskapade_run(macro)
        ds = process_manager.service(DataStore)

        # check generated data
        self.assertIn('voucher_redeems', ds)
        self.assertIn('voucher_ages', ds)
        self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
        self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
        self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
        self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
        self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

        # check fit result
        fit_link = process_manager.get('Fitting').get('Fit')
        self.assertEqual(fit_link.fit_result.status(), 0)
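        # pull of the fitted yield with respect to the 6000 generated events;
        # the checks below require agreement within 3 sigma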
        n_ev_pull = (fit_link.results['n_ev'][0] -
                     6000.) / fit_link.results['n_ev'][1]
        self.assertGreater(n_ev_pull, -3.)
        self.assertLess(n_ev_pull, 3.)

        # check plot output
        plot_path = persistence.io_path('results_data', 'voucher_redeem.pdf')
        self.assertTrue(os.path.exists(plot_path))
        statinfo = os.stat(plot_path)
        self.assertGreater(statinfo.st_size, 0)
Example #3
    def test_esk411(self):
        """Test Esk-411: Predictive maintenance Weibull fit."""
        # run Eskapade
        macro = resources.tutorial('esk411_weibull_predictive_maintenance.py')
        self.eskapade_run(macro)
        ds = process_manager.service(DataStore)
        ws = process_manager.service(RooFitManager).ws

        # roofit objects check in datastore
        self.assertIn('fit_result', ds)
        self.assertIsInstance(ds['fit_result'], ROOT.RooFitResult)

        # roofit objects check in workspace
        self.assertIn('binnedData', ds)
        self.assertIsInstance(ds['binnedData'], ROOT.RooDataHist)
        mdata = ds['binnedData']
        self.assertTrue(mdata)
        self.assertEqual(300, mdata.numEntries())
        mpdf = ws.pdf('sum3pdf')
        self.assertTrue(mpdf)

        # successful fit result
        fit_result = ds['fit_result']
        self.assertEqual(0, fit_result.status())
        self.assertEqual(3, fit_result.covQual())

        n1 = ws.var('N1')
        self.assertTrue(n1)
        self.assertGreater(n1.getVal(), 2.e5)
        n2 = ws.var('N2')
        self.assertTrue(n2)
        self.assertGreater(n2.getVal(), 4.e5)
        n3 = ws.var('N3')
        self.assertTrue(n3)
        self.assertGreater(n3.getVal(), 5.e4)

        # data-summary checks
        file_names = [
            'weibull_fit_report.tex', 'correlation_matrix_fit_result.pdf',
            'floating_pars_fit_result.tex',
            'fit_of_time_difference_medium_range.pdf'
        ]
        for fname in file_names:
            path = persistence.io_path('results_data',
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
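Example #4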
    def test_esk608(self):
        """Test Esk-608: Execute Spark histogram filling macro."""
        # check if required Python and Java libraries are made available to worker nodes
        sc = process_manager.service(SparkManager).get_session().sparkContext
        self.assertRegex(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )
        self.assertRegex(
            sc.getConf().get('spark.jars.packages', ''),
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, test_esk608 will fail'
        )

        # run Eskapade
        self.eskapade_run(resources.tutorial('esk608_spark_histogrammar.py'))
        ds = process_manager.service(DataStore)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(sorted(ds['spark_df'].columns),
                             sorted(['date', 'loc', 'x', 'y']),
                             'unexpected columns in data frame')

        # data-generation checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        col_names = ['date', 'x', 'y', 'loc', 'x:y']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

        # data-summary checks
        f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
        file_names = ['report.tex'] + [
            'hist_{}.pdf'.format(col) for col in f_bases
        ]
        for fname in file_names:
            path = persistence.io_path('results_data',
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #5
    def test_esk401(self):
        """Test Esk-401: ROOT hist fill, plot, convert."""
        # run Eskapade
        self.eskapade_run(
            resources.tutorial('esk401_roothist_fill_plot_convert.py'))
        ds = process_manager.service(DataStore)

        # histogram checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x1:x2', 'x2:x3', 'x4:x5']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(columns))
        for col in columns:
            self.assertIsInstance(ds['hist'][col], ROOT.TH1)

        # data-generation checks
        self.assertIn('n_correlated_data', ds)
        self.assertEqual(500, ds['n_correlated_data'])
        self.assertIn('n_rdh_x1', ds)
        self.assertEqual(40, ds['n_rdh_x1'])
        self.assertIn('n_rds_x2_vs_x3', ds)
        self.assertEqual(23, ds['n_rds_x2_vs_x3'])

        # roofit objects check
        self.assertIn('hpdf', ds)
        self.assertIsInstance(ds['hpdf'], ROOT.RooHistPdf)
        self.assertIn('rdh_x1', ds)
        self.assertIsInstance(ds['rdh_x1'], ROOT.RooDataHist)
        self.assertIn('rds_x2_vs_x3', ds)
        self.assertIsInstance(ds['rds_x2_vs_x3'], ROOT.RooDataSet)
        self.assertIn('vars_x2_vs_x3', ds)
        self.assertIsInstance(ds['vars_x2_vs_x3'], ROOT.RooArgSet)

        # data-summary checks
        file_names = ['report.tex'] + [
            'hist_{}.pdf'.format(col.replace(':', '_vs_')) for col in columns
        ]
        for fname in file_names:
            path = persistence.io_path('results_data',
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #6
    def test_esk305(self):
        """Test Esk-305: Correlation summary."""
        settings = process_manager.service(ConfigObject)
        settings['batchMode'] = True

        self.eskapade_run(resources.tutorial('esk305_correlation_summary.py'))

        ds = process_manager.service(DataStore)

        # input data checks
        all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']

        self.assertIn('input_data', ds)
        self.assertIsInstance(ds['input_data'], pd.DataFrame)
        self.assertListEqual(list(ds['input_data'].columns), all_col_names)

        self.assertIn('correlations', ds)
        self.assertIsInstance(ds['correlations'], list)
        corr_list = ds['correlations']
        self.assertEqual(5, len(corr_list))

        # correlation matrix checks
        col_names = ['x1', 'x2', 'x3', 'x4', 'x5']

        for corr in corr_list:
            self.assertIsInstance(corr, pd.DataFrame)
            # self.assertListEqual(list(corr.columns), col_names)
            self.assertListEqual(list(corr.index), col_names)

        # heatmap pdf checks
        results_path = persistence.io_path('results_data', 'report')

        correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio', 'phik']
        for corr in correlations:
            path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(results_path, corr)
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #7
    def _process_results_path(self):
        """Process results_path argument."""
        if not self.results_path:
            self.results_path = persistence.io_path('results_data', 'report')
        persistence.create_dir(self.results_path)
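Example #8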
##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = SparkStreamingWordCount(name='SparkStreamingWordCount',
                                         read_key='dstream',
                                         store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' +
    persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1)

spark_streaming.add(writer_link)

# start/stop of Spark Streaming
control_link = SparkStreamingController(name='SparkStreamingController',
                                        timeout=10)
spark_streaming.add(control_link)

##########################################################################

logger.debug('Done parsing configuration file esk610_spark_streaming.')
Example #9
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk304_df_boxplot'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""

The plots and latex files produced by link df_summary can be found in dir:
{path}
"""
logger.info(msg, path=persistence.io_path('results_data', 'report'))

COLUMNS = ['var_a', 'var_b', 'var_c']
SIZE = 10000
VAR_LABELS = dict(var_a='Variable A', var_b='Variable B', var_c='Variable C')
VAR_UNITS = dict(var_b='m/s')
GEN_CONF = dict(var_a=dict(choice=['alpha', 'beta', 'gamma'], dtype=str),
                var_b=dict(mean=3., std=1.),
                var_c=dict(choice=['delta', 'epsilon', 'zeta', 'eta'],
                           dtype=str))

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
data = Chain('Data')
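Example #10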
from escore.logger import Logger, LogLevel
from escore.core import persistence

logger = Logger()

logger.debug('Now parsing configuration file esk111_load_datastore_from_file.')

# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk111_load_datastore_from_file'
settings['version'] = 0

ds = process_manager.service(DataStore)
ds['number'] = 1
file_path = persistence.io_path('proc_service_data', 'temp_datastore.pkl')
ds.persist_in_file(file_path)

# --- update the number
ds['number'] = 2

# --- Reload from the pickle file with:
# >>> ds = DataStore.import_from_file(file_path)

# --- now set up the chains and links

ch = Chain('Start')
link = core_ops.ImportDataStore(name='importds', path=file_path)
link.logger.log_level = LogLevel.DEBUG
ch.add(link)
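Example #11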
    #      hand, e.g. csv, xlsx. To use the numpy reader one of the
    #      following should be true:
    #          * reader is {'numpy', 'np', 'npy', 'npz'}
    #          * path contains extensions {'npy', 'npz'}
    #          * param `file_type` is {'npy', 'npz'}
    #  restore_index : bool
    #        whether to store the index in the metadata. Default is
    #        False when the index is numeric, True otherwise.
    #  file_type : str | {'npy', 'npz'}
    #      when using the numpy reader. Optional, see reader for details.

    nw.add(
        ReadToDf(
            name='numpy_reader',
            key='reloaded_typed_data_np',
            path=persistence.io_path('results_data', 'tmp_tut_esk210.npy'),
        ))

# The dataframe has now been restored with the dtypes of the original df,
# and its index has been restored since it is non-numeric.
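# A hedged variant based on the docstring above (the `reader` and `file_type`
# argument names come from that docstring and are not otherwise verified here;
# the path is illustrative): the numpy reader can also be selected explicitly
# when the path lacks an 'npy'/'npz' extension, e.g.
#
#     nw.add(
#         ReadToDf(
#             name='numpy_reader_explicit',
#             key='reloaded_typed_data_np2',
#             path=persistence.io_path('results_data', 'tmp_tut_esk210.dat'),
#             reader='numpy',
#             file_type='npy',
#         ))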

#########################################################################################
# --- Feather reader - writer (R/W)

# The primary benefits of the Feather W/R are:
#    * Interoperability with R dataframes; this package was written
#      to make sharing data between R and Python much easier
#    * High performance; relies on the Apache Arrow framework
#    * Data type restoration
#    * Non-numerical index restoration
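# A minimal sketch of the Feather round trip with plain pandas (assuming
# pyarrow is installed; the Eskapade reader/writer links presumably wrap the
# same calls):
#
#     import pandas as pd
#     df = pd.DataFrame({'a': ['x', 'y'], 'b': [1., 2.]}, index=['r1', 'r2'])
#     # feather itself stores no index, so materialize it as a column first
#     df.reset_index().to_feather('/tmp/example.feather')
#     restored = pd.read_feather('/tmp/example.feather').set_index('index')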
Example #12
    def initialize(self):
        """Initialize the link."""
        # check input arguments
        self.check_arg_types(allow_none=True,
                             read_key=str,
                             output_path=str,
                             compression_codec=str)
        self.check_arg_types(mode=str, sep=str, num_files=int)
        self.check_arg_types(recurse=True, allow_none=True)
        self.check_arg_vals('read_key', 'sep')
        self.check_arg_vals('output_path',
                            'compression_codec',
                            allow_none=True)
        self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))
        if self.num_files < 1:
            raise RuntimeError(
                'Requested number of files is less than 1 ({:d}).'.format(
                    self.num_files))

        # set other attributes
        self.do_execution = True

        # set default output path
        if not self.output_path:
            self.output_path = persistence.io_path(
                'results_data', '{}_output'.format(self.name))

        # parse header argument
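        # a sequence of column names becomes a tuple of header labels;
        # any other value is interpreted as a boolean write-header flag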
        try:
            self.header = tuple(self.header)
        except TypeError:
            self.header = bool(self.header)
        if isinstance(self.header, tuple) and not self.header:
            raise RuntimeError('Empty header sequence specified.')

        # check output directory
        if self.output_path.startswith('file:/'):
            output_path = os.path.abspath(
                self.output_path.replace('file:/', '/'))
            if os.path.exists(output_path):
                # output data already exist
                if self.mode == 'ignore':
                    # do not execute link
                    self.logger.debug(
                        'Output data already exist; not executing link.')
                    self.do_execution = False
                    return StatusCode.Success
                elif self.mode == 'error':
                    # raise exception
                    raise RuntimeError('Output data already exist.')

                # remove output directory
                if not os.path.isdir(output_path):
                    raise RuntimeError(
                        'Output path "{}" is not a directory.'.format(
                            output_path))
                shutil.rmtree(output_path)
            elif not os.path.exists(os.path.dirname(output_path)):
                # create path up to the last component
                self.logger.debug('Creating output path "{path}".',
                                  path=output_path)
                os.makedirs(os.path.dirname(output_path))

        return StatusCode.Success