Example #1
    def initialize(self):
        """Initialize CorrelationSummary"""

        # check input arguments
        self.check_arg_types(read_key=str,
                             store_key=str,
                             results_path=str,
                             methods=list,
                             pages_key=str)
        self.check_arg_vals('read_key')

        # get I/O configuration
        io_conf = ProcessManager().service(ConfigObject).io_conf()

        # read report templates
        with open(
                persistence.io_path('templates', io_conf,
                                    'df_summary_report.tex')) as templ_file:
            self.report_template = templ_file.read()
        with open(
                persistence.io_path(
                    'templates', io_conf,
                    'df_summary_report_page.tex')) as templ_file:
            self.page_template = templ_file.read()

        # get path to results directory
        if not self.results_path:
            self.results_path = persistence.io_path('results_data', io_conf,
                                                    'report')

        # check if output directory exists
        if os.path.exists(self.results_path):
            # check if path is a directory
            if not os.path.isdir(self.results_path):
                self.log().critical('output path "%s" is not a directory',
                                    self.results_path)
                raise AssertionError('output path is not a directory')
        else:
            # create directory
            self.log().debug('Making output directory "%s"', self.results_path)
            os.makedirs(self.results_path)

        # check methods
        for method in self.methods:
            if method not in ALL_CORRS:
                logstring = '"{}" is not a valid correlation method, please use one of {}'
                logstring = logstring.format(
                    method, ', '.join(['"' + m + '"' for m in ALL_CORRS]))
                raise AssertionError(logstring)

        # initialize attributes
        self.pages = []

        return StatusCode.Success
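
The check_arg_types/check_arg_vals calls above validate the link's configured
attributes before anything else runs. A minimal sketch of the kind of check
performed, assuming plain getattr-based attributes (this is not the actual
Eskapade implementation):

    def check_arg_types(obj, allow_none=False, **types):
        """Check that each named attribute of obj has the expected type."""
        for name, expected in types.items():
            value = getattr(obj, name)
            if value is None and allow_none:
                continue
            if not isinstance(value, expected):
                raise TypeError('attribute "{}" is not of type "{}"'.format(
                    name, expected.__name__))
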
Example #2
    def test_esk409(self):
        """Test Esk-409: Unredeemed vouchers."""
        # run Eskapade
        macro = resources.tutorial('esk409_unredeemed_vouchers.py')
        self.eskapade_run(macro)
        ds = process_manager.service(DataStore)

        # check generated data
        self.assertIn('voucher_redeems', ds)
        self.assertIn('voucher_ages', ds)
        self.assertIsInstance(ds['voucher_redeems'], ROOT.RooDataSet)
        self.assertIsInstance(ds['voucher_ages'], ROOT.RooDataSet)
        self.assertLess(ds['voucher_redeems'].numEntries(), 6000)
        self.assertGreater(ds['voucher_redeems'].numEntries(), 0)
        self.assertEqual(ds['voucher_ages'].numEntries(), 10000)

        # check fit result
        fit_link = process_manager.get('Fitting').get('Fit')
        self.assertEqual(fit_link.fit_result.status(), 0)
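        # pull of the fitted yield w.r.t. the generated value (6000): (fitted - true) / fit error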
        n_ev_pull = (fit_link.results['n_ev'][0] -
                     6000.) / fit_link.results['n_ev'][1]
        self.assertGreater(n_ev_pull, -3.)
        self.assertLess(n_ev_pull, 3.)

        # check plot output
        plot_path = persistence.io_path('results_data', 'voucher_redeem.pdf')
        self.assertTrue(os.path.exists(plot_path))
        statinfo = os.stat(plot_path)
        self.assertGreater(statinfo.st_size, 0)
Example #3
    def initialize(self):
        """Initialize the link."""
        # perform basic checks of configured attributes
        # a key and path OR dictionary need to have been set.
        if self.path and self.key:
            self.dictionary = {self.key: self.path}
        elif not self.dictionary:
            raise Exception('Path and key OR dictionary not properly set.')

        # correct the output paths, if need be
        paths = list(self.dictionary.values())
        assert '' not in paths, 'One or more of the paths in dict is empty.'
        assert all(isinstance(p, str) for p in paths), \
            'One or more of the paths in dict is not a string.'
        # update paths if needed
        for k, p in self.dictionary.items():
            if '/' not in p:
                self.dictionary[k] = persistence.io_path('results_data', p)
                self.logger.debug(
                    'Output filename for key <{key}> has been reset to {new_key}.',
                    key=k,
                    new_key=self.dictionary[k])
        self.logger.info('kwargs passed on to pandas writer are: {kwargs}.',
                         kwargs=self.kwargs)

        return StatusCode.Success
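
The loop above rewrites any bare file name (no '/') to the shared results
area, while explicit paths are left untouched. A hedged illustration; the
WriteFromDf link name and the file names are assumptions:

    link = WriteFromDf(name='writer', key='out', path='summary.csv')
    link.initialize()
    # 'summary.csv' contains no '/', so it is rewritten to
    # persistence.io_path('results_data', 'summary.csv')

    link = WriteFromDf(name='writer', key='out', path='/tmp/summary.csv')
    link.initialize()
    # explicit path: left as-is
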
Example #4
    def test_esk608(self):
        """Test Esk-608: Execute Spark histogram filling macro"""

        # check if required Python and Java libraries are made available to worker nodes
        sc = ProcessManager().service(SparkManager).get_session().sparkContext
        self.assertRegexpMatches(
            sc.getConf().get('spark.master', ''), r'local\[.*\]',
            'Spark not running in local mode, required for testing with local files'
        )
        self.assertRegexpMatches(
            sc.getConf().get('spark.jars.packages', ''),
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4',
            'org.diana-hep:histogrammar-sparksql_2.11:1.0.4 missing from spark.jars.packages, test_esk608 will fail'
        )
        if re.search('spark://', sc.getConf().get('spark.master', '')):
            py_mods = utils.get_file_path('py_mods')
            self.assertRegexpMatches(
                sc.getConf().get('spark.submit.pyFiles', ''), py_mods,
                'Eskapade modules missing from spark.submit.pyFiles, needed in Spark cluster mode'
            )
            self.assertRegexpMatches(
                sc.getConf().get('spark.files', ''), py_mods,
                'Eskapade modules missing from spark.files, needed in Spark cluster mode'
            )

        # run Eskapade
        self.run_eskapade('esk608_spark_histogrammar.py')
        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)
        settings = ProcessManager().service(ConfigObject)

        # check data frame
        self.assertIn('spark_df', ds,
                      'no object with key "spark_df" in data store')
        self.assertIsInstance(ds['spark_df'], pyspark.sql.DataFrame,
                              '"spark_df" is not a Spark data frame')
        self.assertEqual(ds['spark_df'].count(), 12,
                         'unexpected number of rows in data frame')
        self.assertListEqual(sorted(ds['spark_df'].columns),
                             sorted(['date', 'loc', 'x', 'y']),
                             'unexpected columns in data frame')

        # data-generation checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        col_names = ['date', 'x', 'y', 'loc', 'x:y']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(col_names))

        # data-summary checks
        f_bases = ['date', 'x', 'y', 'loc', 'x_vs_y']
        file_names = ['report.tex'
                      ] + ['hist_{}.pdf'.format(col) for col in f_bases]
        for fname in file_names:
            path = persistence.io_path('results_data', settings.io_conf(),
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #5
    def run_eskapade(self,
                     macro,
                     return_status=definitions.StatusCode.Success):
        """Run Eskapade"""

        proc_mgr = ProcessManager()
        settings = proc_mgr.service(ConfigObject)
        settings['macro'] = persistence.io_path('macros', settings.io_conf(),
                                                macro)
        status = execution.run_eskapade(settings)
        self.assertEqual(status, return_status)
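
A test built on this helper might look as follows; the macro names and the
failure case are hypothetical:

    def test_my_macro(self):
        # default: assert the macro ran successfully
        self.run_eskapade('esk101_helloworld.py')
        # a macro expected to fail can assert a non-success status
        self.run_eskapade('some_broken_macro.py',
                          return_status=definitions.StatusCode.Failure)
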
Example #6
    def initialize(self):
        """Initialize SparkDataToCsv"""

        # check input arguments
        self.check_arg_types(allow_none=True, read_key=str, output_path=str, compression_codec=str)
        self.check_arg_types(mode=str, sep=str, num_files=int)
        self.check_arg_types(recurse=True, allow_none=True)
        self.check_arg_vals('read_key', 'sep')
        self.check_arg_vals('output_path', 'compression_codec', allow_none=True)
        self.check_arg_opts(mode=('overwrite', 'ignore', 'error'))
        if self.num_files < 1:
            raise RuntimeError('requested number of files is less than 1 ({:d})'.format(self.num_files))

        # set other attributes
        self.do_execution = True

        # set default output path
        if not self.output_path:
            settings = ProcessManager().service(ConfigObject)
            self.output_path = 'file:' + persistence.io_path('results_data', settings.io_conf(), '{}_output'.format(self.name))

        # parse header argument
        try:
            self.header = tuple(self.header)
        except TypeError:
            self.header = bool(self.header)
        if isinstance(self.header, tuple) and not self.header:
            raise RuntimeError('empty header sequence specified')
    
        # check output directory, if local
        if self.output_path.startswith('file:'):
            local_output_path = os.path.abspath(self.output_path.replace('file:', ''))
            if os.path.exists(local_output_path):
                # output data already exist
                if self.mode == 'ignore':
                    # do not execute link
                    self.log().debug('Output data already exist; not executing link')
                    self.do_execution = False
                    return StatusCode.Success
                elif self.mode == 'error':
                    # raise exception
                    raise RuntimeError('output data already exist')

                # remove output directory
                if not os.path.isdir(local_output_path):
                    raise RuntimeError('output path "{}" is not a directory'.format(local_output_path))
                shutil.rmtree(local_output_path)
            elif not os.path.exists(os.path.dirname(local_output_path)):
                # create path up to the last component
                self.log().debug('Creating output path "%s"', local_output_path)
                os.makedirs(os.path.dirname(local_output_path))

        return StatusCode.Success
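
The try/except around tuple(self.header) above is a duck-typing idiom: any
iterable of column names becomes a tuple, while anything non-iterable falls
back to a boolean flag. A standalone sketch of the same idiom:

    def parse_header(header):
        """Return a tuple of column names, or a boolean flag."""
        try:
            return tuple(header)   # e.g. ['a', 'b'] -> ('a', 'b')
        except TypeError:
            return bool(header)    # e.g. True -> True, None -> False

    assert parse_header(['x', 'y']) == ('x', 'y')
    assert parse_header(True) is True

Note that a plain string is also iterable, so parse_header('ab') returns
('a', 'b'); pass a list or tuple of names instead.
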
Example #7
    def _create_spark_conf(self,
                           eskapade_settings=None,
                           config_path=None,
                           spark_settings=None):
        """Create and set Spark configuration
        Read the Spark configuration file and store the settings as a SparkConf
        object.  The path of the configuration file is given by the config_path
        argument or, if this argument is not specified, it is obtained from the
        Eskapade settings object (key "sparkCfgFile").  If neither of these
        inputs are provided, an empty configuration object is created.

        With the spark_settings argument, settings from the configuration file
        can be overwritten.  Also additional settings can be specified with this
        argument.

        :param str config_path: path of configuration file
        :param eskapade.ConfigObject es_settings_obj: Eskapade configuration (key "sparkCfgFile" for config path)
        :param iterable spark_settings: iterable of custom settings key-value pairs to be set
        """

        # set path of config file
        cfg_path = None
        if config_path:
            cfg_path = str(config_path)
        elif eskapade_settings:
            cfg_path = str(eskapade_settings.get('sparkCfgFile'))
        if cfg_path and eskapade_settings and not os.path.isabs(cfg_path):
            cfg_path = persistence.io_path('config_spark',
                                           eskapade_settings.io_conf(),
                                           cfg_path)
        if cfg_path and cfg_path != self.config_path:
            self.log().debug(
                'Setting configuration file path to "{}"'.format(cfg_path))
            self.config_path = cfg_path
            self.reset_config()

        # create Spark config
        spark_conf = pyspark.conf.SparkConf()

        # set settings from config file
        if self.config_path:
            cfg = self.get_config()
            if CONF_PREFIX not in cfg:
                raise RuntimeError(
                    'No section "{}" found in config file'.format(CONF_PREFIX))
            spark_conf.setAll(cfg.items(CONF_PREFIX))

        # set custom settings
        if spark_settings:
            spark_conf.setAll(spark_settings)

        return spark_conf
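
The docstring lists three ways to feed in the configuration; a sketch of each
call is given below. The manager access and the setting values are
assumptions:

    sm = ProcessManager().service(SparkManager)

    # 1. explicit configuration file
    conf = sm._create_spark_conf(config_path='/path/to/spark.cfg')

    # 2. path resolved from the Eskapade settings (key "sparkCfgFile")
    conf = sm._create_spark_conf(eskapade_settings=settings)

    # 3. custom key-value pairs, overriding anything read from the file
    conf = sm._create_spark_conf(spark_settings=[('spark.app.name', 'my_app')])
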
Example #8
    def test_esk411(self):
        """Test Esk-411: Predictive maintenance Weibull fit"""

        # run Eskapade
        self.run_eskapade('esk411_weibull_predictive_maintenance.py')
        ds = ProcessManager().service(DataStore)
        ws = ProcessManager().service(RooFitManager).ws

        # roofit objects check in datastore
        self.assertIn('fit_result', ds)
        self.assertIsInstance(ds['fit_result'], ROOT.RooFitResult)

        # roofit objects check in workspace
        self.assertIn('binnedData', ds)
        self.assertIsInstance(ds['binnedData'], ROOT.RooDataHist)
        mdata = ds['binnedData']
        self.assertTrue(mdata)
        self.assertEqual(300, mdata.numEntries())
        mpdf = ws.pdf('sum3pdf')
        self.assertTrue(mpdf)

        # successful fit result
        fit_result = ds['fit_result']
        self.assertEqual(0, fit_result.status())
        self.assertEqual(3, fit_result.covQual())

        n1 = ws.var('N1')
        self.assertTrue(n1)
        self.assertGreater(n1.getVal(), 2.e5)
        n2 = ws.var('N2')
        self.assertTrue(n2)
        self.assertGreater(n2.getVal(), 4.e5)
        n3 = ws.var('N3')
        self.assertTrue(n3)
        self.assertGreater(n3.getVal(), 5.e4)

        # data-summary checks
        io_conf = ProcessManager().service(ConfigObject).io_conf()
        file_names = [
            'weibull_fit_report.tex', 'correlation_matrix_fit_result.pdf',
            'floating_pars_fit_result.tex',
            'fit_of_time_difference_medium_range.pdf'
        ]
        for fname in file_names:
            path = persistence.io_path('results_data', io_conf,
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #9
    def initialize(self):
        """Initialize WsUtils"""

        # check input arguments
        self.check_arg_types(pages_key=str)

        if isinstance(self.copy_into_ws, str):
            self.copy_into_ws = [self.copy_into_ws]
        assert isinstance(self.copy_into_ws, list), 'copy_into_ws needs to be a string or list of strings.'

        if isinstance(self.copy_into_ds, str):
            self.copy_into_ds = [self.copy_into_ds]
        assert isinstance(self.copy_into_ds, list), 'copy_into_ds needs to be a string or list of strings.'

        # get I/O configuration
        io_conf = ProcessManager().service(ConfigObject).io_conf()

        # read report templates
        with open(persistence.io_path('templates', io_conf, 'df_summary_report.tex')) as templ_file:
            self.report_template = templ_file.read()
        with open(persistence.io_path('templates', io_conf, 'df_summary_report_page.tex')) as templ_file:
            self.page_template = templ_file.read()
        with open(persistence.io_path('templates', io_conf, 'df_summary_table_page.tex')) as templ_file:
            self.table_template = templ_file.read()

        # get path to results directory
        if not self.results_path:
            self.results_path = persistence.io_path('results_data', io_conf, 'report')

        # check if output directory exists
        if os.path.exists(self.results_path):
            # check if path is a directory
            if not os.path.isdir(self.results_path):
                self.log().critical('output path "%s" is not a directory', self.results_path)
                raise AssertionError('output path is not a directory')
        else:
            # create directory
            self.log().debug('Making output directory %s', self.results_path)
            os.makedirs(self.results_path)

        # make sure Eskapade RooFit library is loaded for fitting (for plotting correlation matrix)
        if self._fit:
            roofit_utils.load_libesroofit()

        return StatusCode.Success
Example #10
    def test_esk305(self):
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + '/tutorials/esk305_correlation_summary.py'
        settings['batchMode'] = True

        status = execution.run_eskapade(settings)
        self.assertTrue(status.isSuccess())

        ds = ProcessManager().service(DataStore)

        # input data checks
        all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']

        self.assertIn('input_data', ds)
        self.assertIsInstance(ds['input_data'], pd.DataFrame)
        self.assertListEqual(list(ds['input_data'].columns), all_col_names)

        self.assertIn('correlations', ds)
        self.assertIsInstance(ds['correlations'], list)
        corr_list = ds['correlations']
        self.assertEqual(4, len(corr_list))

        # correlation matrix checks
        col_names = ['x1', 'x2', 'x3', 'x4', 'x5']

        for corr in corr_list:
            self.assertIsInstance(corr, pd.DataFrame)
            #self.assertListEqual(list(corr.columns), col_names)
            self.assertListEqual(list(corr.index), col_names)

        # heatmap pdf checks
        io_conf = settings.io_conf()
        results_path = persistence.io_path('results_data', io_conf, 'report')

        correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio']
        for corr in correlations:
            path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(
                results_path, corr)
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #11
    def test_esk401(self):
        """Test Esk-401: ROOT hist fill, plot, convert"""

        # run Eskapade
        self.run_eskapade('esk401_roothist_fill_plot_convert.py')
        ds = ProcessManager().service(DataStore)

        # histogram checks
        self.assertIn('hist', ds)
        self.assertIsInstance(ds['hist'], dict)
        columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x1:x2', 'x2:x3', 'x4:x5']
        self.assertListEqual(sorted(ds['hist'].keys()), sorted(columns))
        for col in columns:
            self.assertIsInstance(ds['hist'][col], ROOT.TH1)

        # data-generation checks
        self.assertIn('n_correlated_data', ds)
        self.assertEqual(500, ds['n_correlated_data'])
        self.assertIn('n_rdh_x1', ds)
        self.assertEqual(40, ds['n_rdh_x1'])
        self.assertIn('n_rds_x2_vs_x3', ds)
        self.assertEqual(23, ds['n_rds_x2_vs_x3'])

        # roofit objects check
        self.assertIn('hpdf', ds)
        self.assertIsInstance(ds['hpdf'], ROOT.RooHistPdf)
        self.assertIn('rdh_x1', ds)
        self.assertIsInstance(ds['rdh_x1'], ROOT.RooDataHist)
        self.assertIn('rds_x2_vs_x3', ds)
        self.assertIsInstance(ds['rds_x2_vs_x3'], ROOT.RooDataSet)
        self.assertIn('vars_x2_vs_x3', ds)
        self.assertIsInstance(ds['vars_x2_vs_x3'], ROOT.RooArgSet)

        # data-summary checks
        io_conf = ProcessManager().service(ConfigObject).io_conf()
        file_names = ['report.tex'] + [
            'hist_{}.pdf'.format(col.replace(':', '_vs_')) for col in columns
        ]
        for fname in file_names:
            path = persistence.io_path('results_data', io_conf,
                                       'report/{}'.format(fname))
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #12
    def initialize(self):
        """ Initialize WriteFromDf """

        # perform basic checks of configured attributes
        # a key and path need to have been set.
        if self.key == '' and self.path == '' and self.dictionary is None:
            raise Exception('Key, path and dictionary are not set.')
        if len(self.key) == 0 and len(self.dictionary) == 0:
            raise Exception('Key or dict has not been set.')
        if len(self.path) == 0 and len(self.dictionary) == 0:
            raise Exception('Output filename or dict has not been set. Exit.')
        else:
            assert self.path != '' and isinstance(self.path,
                                                  str), 'path not given.'
        if self.path and self.key:
            self.dictionary = {self.key: self.path}
        elif self.dictionary:
            pass
        else:
            raise Exception('Path and key OR dictionary not properly set.')

        # correct the output paths, if need be
        paths = list(self.dictionary.values())
        assert '' not in paths, 'One or more of the paths in dict is empty.'
        assert all(isinstance(p, str) for p in paths), \
            'One or more of the paths in dict is not a string.'
        # update paths if needed
        for k, p in self.dictionary.items():
            if '/' not in p:
                io_conf = ProcessManager().service(ConfigObject).io_conf()
                self.dictionary[k] = persistence.io_path(
                    'results_data', io_conf, p)
                self.log().debug('Output filename for key <%s> has been reset to: %s',
                                 k, self.dictionary[k])

        self.log().info('kwargs passed on to pandas writer are: %s', self.kwargs)

        return StatusCode.Success
Example #13
    def test_esk305(self):
        settings = process_manager.service(ConfigObject)
        settings['batchMode'] = True

        self.eskapade_run(resources.tutorial('esk305_correlation_summary.py'))

        ds = process_manager.service(DataStore)

        # input data checks
        all_col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'Unnamed: 5']

        self.assertIn('input_data', ds)
        self.assertIsInstance(ds['input_data'], pd.DataFrame)
        self.assertListEqual(list(ds['input_data'].columns), all_col_names)

        self.assertIn('correlations', ds)
        self.assertIsInstance(ds['correlations'], list)
        corr_list = ds['correlations']
        self.assertEqual(4, len(corr_list))

        # correlation matrix checks
        col_names = ['x1', 'x2', 'x3', 'x4', 'x5']

        for corr in corr_list:
            self.assertIsInstance(corr, pd.DataFrame)
            # self.assertListEqual(list(corr.columns), col_names)
            self.assertListEqual(list(corr.index), col_names)

        # heatmap pdf checks
        results_path = persistence.io_path('results_data', 'report')

        correlations = ['pearson', 'kendall', 'spearman', 'correlation_ratio']
        for corr in correlations:
            path = '{0:s}/correlations_input_data_{1:s}.pdf'.format(
                results_path, corr)
            self.assertTrue(os.path.exists(path))
            statinfo = os.stat(path)
            self.assertGreater(statinfo.st_size, 0)
Example #14
    def initialize(self):
        """Initialize CorrelationSummary"""

        # get I/O configuration
        io_conf = ProcessManager().service(ConfigObject).io_conf()

        # get path to results directory
        if not self.results_path:
            self.results_path = persistence.io_path('results_data', io_conf,
                                                    'report')

        # check if output directory exists
        if os.path.exists(self.results_path):
            # check if path is a directory
            if not os.path.isdir(self.results_path):
                self.log().critical('output path "%s" is not a directory',
                                    self.results_path)
                raise AssertionError('output path is not a directory')
        else:
            # create directory
            self.log().debug('Making output directory "%s"', self.results_path)
            os.makedirs(self.results_path)

        # check method
        if self.method not in ALL_CORRS:
            logstring = '"{}" is not a valid correlation method, please use one of {}; using "pearson"'
            logstring = logstring.format(
                self.method, ', '.join(['"' + m + '"' for m in ALL_CORRS]))
            self.log().error(logstring)
            self.method = 'pearson'

        # check input arguments
        self.check_arg_types(read_key=str, method=str)
        self.check_arg_vals('read_key')

        return StatusCode.Success
Example #15
#########################################################################################
# --- minimal analysis information

settings = process_manager.service(ConfigObject)
settings['analysisName'] = 'esk411_weibull_predictive_maintenance'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""

The plots and latex report produced by link WsUtils can be found in dir:
{path}
"""
logger.info(msg, path=persistence.io_path('results_data', 'report'))

settings['generate'] = True
# settings['read_data'] = not settings['generate']
settings['model'] = True
settings['process'] = True
settings['fit_plot'] = True
settings['summary'] = True

fitpdf = 'sum3pdf'
n_percentile_bins = 300

#########################################################################################
# --- now set up the chains and links based on configuration flags

if settings['model']:
Example #16
    def test_esk106_script(self, mock_argv):
        """Test Eskapade run with esk106 macro from script"""

        proc_mgr = ProcessManager()

        # get file paths
        settings = proc_mgr.service(ConfigObject)
        settings['analysisName'] = 'esk106_cmdline_options'
        settings_ = settings.copy()
        script_path = eskapade.utils.get_file_path('run_eskapade')
        macro_path = persistence.io_path('macros', settings.io_conf(),
                                         'esk106_cmdline_options.py')

        # import run-script module
        orig_mod_path = sys.path.copy()
        sys.path.append(os.path.dirname(script_path))
        script_mod = os.path.splitext(os.path.basename(script_path))[0]
        run_eskapade = importlib.import_module(script_mod)

        # mock command-line arguments
        args = []
        mock_argv.__getitem__ = lambda s, k: args.__getitem__(k)

        # base settings
        args_ = [script_path, macro_path, '-LDEBUG', '--batch-mode']
        settings_['macro'] = macro_path
        settings_['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings_['batchMode'] = True

        def do_run(name, args, args_, settings_, add_args, add_settings,
                   chains):
            # set arguments
            args.clear()
            args += args_ + add_args
            settings = settings_.copy()
            settings.update(add_settings)

            # run Eskapade
            proc_mgr.reset()
            run_eskapade.main()
            settings_run = proc_mgr.service(ConfigObject)

            # check results
            self.assertListEqual(
                [c.name for c in proc_mgr.chains], chains,
                'unexpected chain names in "{}" test'.format(name))
            self.assertDictEqual(
                settings_run, settings,
                'unexpected settings in "{}" test'.format(name))

        # run both chains
        do_run(
            'both chains', args, args_, settings_,
            ['--store-all', '-cdo_chain0=True', '-cdo_chain1=True'],
            dict(storeResultsEachChain=True, do_chain0=True,
                 do_chain1=True), ['Chain0', 'Chain1'])

        # run only last chain by skipping the first
        do_run('skip first', args, args_, settings_,
               ['-bChain1', '-cdo_chain0=True', '-cdo_chain1=True'],
               dict(beginWithChain='Chain1', do_chain0=True,
                    do_chain1=True), ['Chain0', 'Chain1'])

        # run only last chain by not defining the first
        do_run('no first', args, args_, settings_,
               ['-cdo_chain0=False', '-cdo_chain1=True'],
               dict(do_chain0=False, do_chain1=True), ['Chain1'])

        # restore module search path
        sys.path.clear()
        sys.path += orig_mod_path
Example #17
log.debug('Now parsing configuration file esk305_correlation_summary')

#########################################################################################
# --- minimal analysis information

proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk305_correlation_summary'
settings['version'] = 0

#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

settings['input_path'] = persistence.io_path('data', settings.io_conf(),
                                             'correlated_data.sv.gz')
settings['reader'] = 'csv'
settings['separator'] = ' '
settings['correlations'] = [
    'pearson', 'kendall', 'spearman', 'correlation_ratio'
]

#########################################################################################
# --- now set up the chains and links based on configuration flags

# create chains
proc_mgr.add_chain('Data')
proc_mgr.add_chain('Summary')

# load data
reader = analysis.ReadToDf(name='reader',
Example #18
#########################################################################################
# --- minimal analysis information
proc_mgr = ProcessManager()
settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'Tutorial_5'

#########################################################################################
# --- setup Spark

proc_mgr.service(SparkManager).get_or_create_session()

#########################################################################################
# --- analysis values, settings, helper functions, configuration flags.

DATA_FILE_PATH = persistence.io_path('data', settings.io_conf(),
                                     'LAozone.data')
VAR_LABELS = dict(doy='Day of year',
                  date='Date',
                  vis='Visibility',
                  vis_km='Visibility')
VAR_UNITS = dict(vis='mi', vis_km='km')


def comp_date(day):
    """Get date/time from day of year"""

    import pandas as pd
    return pd.Timestamp('1976-01-01') + pd.Timedelta('{:d}D'.format(day - 1))


def mi_to_km(dist):
Example #19
else:
    logger.error('unsupported stream_type specified: {type}.', type=stream_type)

##########################################################################
# --- now set up the chains and links based on configuration flags

spark_streaming = Chain('SparkStreaming')

# the word count example
wordcount_link = spark_analysis.SparkStreamingWordCount(
    name='SparkStreamingWordCount', read_key='dstream', store_key='wordcounts')
spark_streaming.add(wordcount_link)

# store output
writer_link = spark_analysis.SparkStreamingWriter(
    name='SparkStreamingWriter',
    read_key=wordcount_link.store_key,
    output_path='file:' + persistence.io_path('results_data', '/dstream/wordcount'),
    suffix='txt',
    repartition=1)

spark_streaming.add(writer_link)

# start/stop of Spark Streaming
control_link = spark_analysis.SparkStreamingController(name='SparkStreamingController', timeout=10)
spark_streaming.add(control_link)

##########################################################################

logger.debug('Done parsing configuration file esk610_spark_streaming.')
Example #20
    def initialize(self):
        """Initialize UncorrelationHypothesisTester"""

        # check input arguments
        self.check_arg_types(read_key=str, significance_key=str, sk_significance_map=str, sk_residuals_map=str,
                             sk_residuals_overview=str, default_number_of_bins=int, nsims_per_significance=int, prefix=str,
                             z_threshold=float, pages_key=str, clientpages_key=str, hist_dict_key=str)
        self.check_arg_types(recurse=True, allow_none=True, columns=str)
        self.check_arg_types(recurse=True, allow_none=True, x_columns=str)
        self.check_arg_types(recurse=True, allow_none=True, y_columns=str)
        self.check_arg_types(recurse=True, allow_none=True, ignore_categories=str)
        self.check_arg_types(recurse=True, allow_none=True, var_ignore_categories=str)
        self.check_arg_vals('read_key')
        self.check_arg_vals('significance_key')

        if self.map_to_original and not isinstance(self.map_to_original, str) \
                and not isinstance(self.map_to_original, dict):
            raise TypeError('map_to_original needs to be a dict or string (to fetch a dict from the datastore)')

        # get I/O configuration
        io_conf = ProcessManager().service(ConfigObject).io_conf()

        # read report templates
        with open(persistence.io_path('templates', io_conf, 'df_summary_report.tex')) as templ_file:
            self.report_template = templ_file.read()
        with open(persistence.io_path('templates', io_conf, 'df_summary_report_page.tex')) as templ_file:
            self.page_template = templ_file.read()
        with open(persistence.io_path('templates', io_conf, 'df_summary_table_page.tex')) as templ_file:
            self.table_template = templ_file.read()

        # get path to results directory
        if not self.results_path:
            self.results_path = persistence.io_path('results_data', io_conf, 'report')
        if self.results_path and not self.results_path.endswith('/'):
            self.results_path = self.results_path + '/'

        # check if output directory exists
        if os.path.exists(self.results_path):
            # check if path is a directory
            if not os.path.isdir(self.results_path):
                self.log().critical('output path "%s" is not a directory', self.results_path)
                raise AssertionError('output path is not a directory')
        else:
            # create directory
            self.log().debug('Making output directory "%s"', self.results_path)
            os.makedirs(self.results_path)

        # prefix for file storage
        if self.prefix and not self.prefix.endswith('_'):
            self.prefix = self.prefix + '_'

        # check provided columns
        if len(self.columns):
            assert len(self.x_columns) == 0 and len(self.y_columns) == 0, \
                'Set either columns OR x_columns and y_columns.'
        if len(self.x_columns):
            assert len(self.columns) == 0 and len(self.y_columns) > 0, \
                'Set either columns OR x_columns and y_columns.'
        self._all_columns = []

        # check that var_ignore_categories are set correctly.
        for col, ic in self.var_ignore_categories.items():
            if isinstance(ic, str):
                self.var_ignore_categories[col] = [ic]
            elif not isinstance(ic, list):
                raise TypeError('var_ignore_categories key "%s" needs to be a string or list of strings' % col)

        # load roofit classes
        roofit_utils.load_libesroofit()

        return StatusCode.Success
Example #21
log.debug('Now parsing configuration file esk209_read_big_data_itr')

#########################################################################################
# --- minimal analysis information
settings = ProcessManager().service(ConfigObject)
settings['analysisName'] = 'esk209_read_big_data_itr'
settings['version'] = 0

#########################################################################################

# when chunking through an input file, pick up only N lines in each iteration.
chunksize = 5

#########################################################################################
# --- Set path of data
data_path = persistence.io_path('data', settings.io_conf(), 'dummy.csv')

#########################################################################################
# --- now set up the chains and links, based on configuration flags

proc_mgr = ProcessManager()

# --- example 1: readdata loops over the input files, but no file chunking.

if settings.get('do_example1', True):
    ch = proc_mgr.add_chain('MyChain1')

    # --- a loop is set up in the chain MyChain.
    #     we iterate over (chunks of) the next file in the list until the iterator is done.
    #     then move on to the next chain (Overview)
Example #22
    def _process_results_path(self):
        """Process results_path argument."""
        if not self.results_path:
            self.results_path = persistence.io_path('results_data', 'report')
        persistence.create_dir(self.results_path)
Example #23
settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk604_spark_execute_query'
settings['version'] = 0

##########################################################################
# Start Spark session

spark = proc_mgr.service(SparkManager).create_session(
    eskapade_settings=settings)

##########################################################################
# CSV and dataframe settings

# NB: local file may not be accessible to worker node in cluster mode
file_paths = [
    'file:' + persistence.io_path('data', settings.io_conf(), 'dummy1.csv'),
    'file:' + persistence.io_path('data', settings.io_conf(), 'dummy2.csv')
]

# define store_key for all data files to be read in
STORE_KEYS = ['spark_df1', 'spark_df2']

##########################################################################
# Now set up the chains and links based on configuration flags

proc_mgr.add_chain('Read')

# create read link for each data file
for index, key in enumerate(STORE_KEYS):
    read_link = spark_analysis.SparkDfReader(name='Reader' + str(index + 1),
                                             store_key=key,
Example #24
proc_mgr = ProcessManager()

settings = proc_mgr.service(ConfigObject)
settings['analysisName'] = 'esk411_weibull_predictive_maintenance'
settings['version'] = 0


#########################################################################################
# --- Analysis values, settings, helper functions, configuration flags.

msg = r"""

The plots and latex report produced by link WsUtils can be found in dir:
%s
""" % (persistence.io_path('results_data', settings.io_conf(), 'report'))
log.info(msg)

settings['generate'] = True
#settings['read_data'] = not settings['generate']
settings['model'] = True
settings['process'] = True
settings['fit_plot'] = True
settings['summary'] = True

fitpdf = 'sum3pdf'
n_percentile_bins = 300


#########################################################################################
# --- now set up the chains and links based on configuration flags