Exemplo n.º 1
0
    def test_execute(self):
        """Check that DsToDs moves a value from the old key to the new key."""
        from eskapade import ProcessManager, DataStore
        from eskapade.core_ops.links import DsToDs

        # seed the datastore with a value under the source key
        datastore = ProcessManager().service(DataStore)
        datastore['test'] = 1

        # configure and run the link
        link = DsToDs()
        link.readKey = 'test'
        link.storeKey = 'moved_test'
        link.execute()

        keys = list(datastore.keys())
        self.assertIn('moved_test', keys, 'new key not in datastore')
        self.assertNotIn('test', keys, 'old key still in datastore')
        self.assertEqual(datastore['moved_test'], 1, 'key-value pair not consistent')
Exemplo n.º 2
0
    def test_execute(self):
        """Check that ApplyFuncToDf stores its output and adds the requested column."""
        from eskapade import ProcessManager, DataStore
        from eskapade.analysis import ApplyFuncToDf

        # dummy input frame with mixed column types
        test_df = pd.DataFrame({
            'a': ['aap', 'noot', 'mies'],
            'b': [0, 1, 2],
            'c': [0, 1, 1],
            'd': [1, 'a', None]
        })

        # put the frame in the datastore
        datastore = ProcessManager().service(DataStore)
        datastore['test_input'] = test_df

        # configure and run the link
        link = ApplyFuncToDf()
        link.add_columns = {'foo': 'bar'}
        link.read_key = 'test_input'
        link.store_key = 'test_output'
        link.execute()

        # stored at all?
        self.assertIn('test_output', list(datastore.keys()), 'DataFrame not stored')

        # added a column?
        self.assertIn('foo', datastore['test_output'].columns,
                      'Column not added to DataFrame')
Exemplo n.º 3
0
    def execute(self):
        """ Execute WriteFromDf

        Pick up the dataframe(s) registered in ``self.dictionary`` and write
        each one to its configured path on disk.

        :returns: status code of execution
        :rtype: StatusCode
        """

        ds = ProcessManager().service(DataStore)

        # check that all dataframes are present
        assert all(
            k in list(ds.keys())
            for k in list(self.dictionary.keys())), 'key(s) not in DataStore.'

        # check that all ds items are dataframes
        assert all(isinstance(ds[k], pd.DataFrame) for k in list(self.dictionary.keys())), \
            'key(s) is not a pandas DataFrame.'

        # collect writer and store the dataframes
        for k in list(self.dictionary.keys()):
            df = ds[k]
            path = self.dictionary[k]
            if self.add_counter_to_name:
                # insert the execution counter before the file extension
                name, ext = os.path.splitext(path)
                path = '%s_%d%s' % (name, self._counter, ext)
            writer = pandasWriter(path, self.writer)
            folder = os.path.dirname(path)
            self.log().debug('Checking for directory: %s', folder)
            # an empty dirname means the current working directory, which
            # always exists; only a named, missing directory is an error
            if folder and not os.path.exists(folder):
                # abort instead of letting the writer fail with an obscure IOError
                self.log().fatal('Path given is invalid.')
                return StatusCode.Failure
            self.log().debug('Writing file: %s', path)
            writer(df, path, **self.kwargs)

        self._counter += 1
        return StatusCode.Success
Exemplo n.º 4
0
    def execute(self):
        """Execute SparkExecuteQuery

        Registers every datastore object as a temporary SQL view, runs the
        configured SQL query, and stores the result under ``self.store_key``
        in the requested output format.
        """

        self.log().debug('Applying following SQL-query to object(s) in DataStore: {0:s}'.format(self.query))

        proc_mgr = ProcessManager()
        ds = proc_mgr.service(DataStore)

        # expose all datastore objects to Spark SQL as temporary views
        for key in ds.keys():
            ds[key].createOrReplaceTempView(key)

        # run the query on the existing Spark session
        session = proc_mgr.service(spark_analysis.SparkManager).get_session()
        result = session.sql(self.query)

        # keep the schema of the query result for later inspection
        self.schema = result.schema

        # optionally convert the Spark dataframe to another representation
        if self.output_format == 'rdd':
            result = result.rdd.map(tuple)  # RDD of plain tuples
        elif self.output_format == 'pd':
            result = result.toPandas()  # Pandas dataframe

        ds[self.store_key] = result

        return StatusCode.Success
Exemplo n.º 5
0
    def execute(self):
        """Execute DsObjectDeleter

        Clears the datastore according to, in order: the configured deletion
        keys, deletion classes, keep-only list, and clear-all flag.  In
        TESTING mode the datastore is left untouched.

        :returns: status code of execution
        :rtype: StatusCode
        """

        settings = ProcessManager().service(ConfigObject)
        ds = ProcessManager().service(DataStore)

        # used in code testing only
        if settings.get('TESTING'):
            self.log().warning(
                'Running in TESTING mode. NOT clearing datastore for testing purposes.'
            )
            return StatusCode.Success

        # delete specific items
        for key in self.deletionKeys:
            if key in ds:
                self.log().debug('Now deleting datastore object with key "%s"',
                                 key)
                del ds[key]

        # delete specific class types
        # NB: iterate over a snapshot of the keys; deleting from the datastore
        # while iterating it directly raises a RuntimeError in Python 3
        for cls in self.deletionClasses:
            for key in list(ds.keys()):
                if isinstance(ds[key], cls):
                    self.log().debug(
                        'Now deleting datastore object with key "%s"', key)
                    del ds[key]

        # delete all but specific items
        if len(self.keepOnly):
            for key in list(ds.keys()):
                if key not in self.keepOnly:
                    self.log().debug(
                        'Now deleting datastore object with key "%s"', key)
                    del ds[key]

        # delete all items in datastore
        if self.clearAll:
            for key in list(ds.keys()):
                self.log().debug('Now deleting datastore object with key "%s"',
                                 key)
                del ds[key]

        return StatusCode.Success
Exemplo n.º 6
0
    def test_execute(self):
        """Check that AssertInDs runs cleanly when all required keys are present."""
        from eskapade import ProcessManager, DataStore
        from eskapade.core_ops.links import AssertInDs

        # fill the datastore with three single-row dataframes
        datastore = ProcessManager().service(DataStore)
        for idx in (1, 2, 3):
            datastore['test%d' % idx] = pd.DataFrame([idx], columns=['data'])

        link = AssertInDs()
        link.keySet = ['test1', 'test2', 'test3']

        link.initialize()
        link.execute()
        link.finalize()

        # There is no output to test against.
        for key in ('test1', 'test2', 'test3'):
            self.assertIn(key, list(datastore.keys()), 'dataframe not in datastore')
Exemplo n.º 7
0
    def execute(self):
        """ Execute AssignRandomClass

        Randomly assigns every record of the input dataframe to one of
        ``self.nclasses`` classes, writing the class index into a new column
        ``self.column``.  Class sizes come from ``self.nevents`` or are
        derived from ``self.fractions``.

        :returns: status code of execution
        :rtype: StatusCode
        """

        ds = ProcessManager().service(DataStore)

        # basic checks on contents of the data frame
        assert self.readKey in list(
            ds.keys()), 'Key %s not in DataStore.' % self.readKey
        df = ds[self.readKey]
        if not isinstance(df, DataFrame):
            raise Exception('Retrieved object not of type pandas DataFrame.')
        ndf = len(df.index)
        assert ndf > 0, 'dataframe %s is empty.' % self.readKey
        if self.column in df.columns:
            raise Exception(
                'Column name <%s> already used: <%s>. Will not overwrite.' %
                (self.column, str(df.columns)))

        # fix final number of events assigned per random class
        # ... the last class picks up the remainder when one count is missing
        if self.nevents is not None:
            if len(self.nevents) == self.nclasses - 1:
                self.nevents.append(ndf - sum(self.nevents))
        if self.nevents is None:
            self.nevents = [int(ndf * f) for f in self.fractions]
        # clip the per-class counts so their running sum never exceeds ndf
        for i in range(self.nclasses):
            nsum = sum(self.nevents[:i + 1])
            ndiff = max(0, nsum - ndf)
            self.nevents[i] = max(0, self.nevents[i] - ndiff)
        for i, n in enumerate(self.nevents):
            assert n >= 0, 'Random class <%d> assigned nevents <%d> needs to be greater than zero. %s' % \
                                                                                        (i, n, str(self.nevents))
            self.log().info('Random class <%d> assigned n events <%d>.' %
                            (i, n))

        # random reshuffling of dataframe indices, seeded for reproducibility
        settings = ProcessManager().service(ConfigObject)
        rng = RandomState(settings['seed'])
        permute = rng.permutation(df.index)

        # apply the random reshuffling, and assign records to the n classes
        df[self.column] = 0
        for i in range(self.nclasses):
            ib = sum(self.nevents[:i])
            ie = sum(self.nevents[:i + 1])
            # df.ix was removed in pandas 1.0; loc is the label-based replacement
            df.loc[permute[ib:ie], self.column] = i

        return StatusCode.Success
Exemplo n.º 8
0
    def execute(self):
        """Create a report of the data frame variables

        For each selected histogram: create a statistics overview table and
        a plot, and accumulate the resulting report page.  All pages are
        then written into a single LaTeX report file.

        :returns: execution status code
        :rtype: StatusCode
        """

        # fetch and check input data frame
        hist_dict = ProcessManager().service(DataStore).get(
            self.read_key, None)
        if not isinstance(hist_dict, dict):
            self.log().critical(
                'no histograms "%s" found in data store for %s', self.read_key,
                str(self))
            raise RuntimeError('no input data found for %s' % str(self))

        # default to processing every histogram in the dictionary
        if self.hist_keys is None:
            self.hist_keys = hist_dict.keys()

        # create report page for each histogram
        self.pages = []
        for name in self.hist_keys:
            self.log().info('processing histogram "%s"', name)

            # skip names that are not present in the input dictionary
            if name not in hist_dict:
                self.log().warning('histogram "%s" not in dictionary "%s"',
                                   name, self.read_key)
                continue

            # plot and summarize, dispatching on dimensionality
            hist = hist_dict[name]
            if hist.n_dim == 1:
                self.process_1d_histogram(name, hist)
            elif hist.n_dim == 2:
                self.process_2d_histogram(name, hist)

        # write out accumulated histogram statistics into report file
        with open('{}/report.tex'.format(self.results_path),
                  'w') as report_file:
            report_file.write(
                self.report_template.replace('INPUT_PAGES',
                                             ''.join(self.pages)))

        return StatusCode.Success
Exemplo n.º 9
0
    def test_esk104(self):
        """Run tutorial esk104 and check the resulting datastore contents."""
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + '/tutorials/esk104_basic_datastore_operations.py'

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        # exactly one object should survive in the datastore
        self.assertTrue(status.isSuccess())
        self.assertEqual(1, len(ds.keys()))
        self.assertEqual(1, ds['a'])
Exemplo n.º 10
0
    def test_esk107(self):
        """Run tutorial esk107: a chain repeated by a looper link."""
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + '/tutorials/esk107_chain_looper.py'

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        # chain is repeated 10 times, with nothing put in datastore
        self.assertTrue(status.isSuccess())
        self.assertEqual(0, len(ds.keys()))
        self.assertEqual(10, pm.chains[0].links[1].maxcount)
Exemplo n.º 11
0
    def test_esk110(self):
        """Run tutorial esk110 and verify the code-profiling settings."""
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + '/tutorials/esk110_code_profiling.py'

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        # nothing should remain in chains or datastore, but profiling is configured
        self.assertTrue(status.isSuccess())
        self.assertEqual(0, len(pm.chains))
        self.assertEqual(0, len(ds.keys()))
        self.assertTrue('doCodeProfiling' in settings)
        self.assertEqual('cumulative', settings['doCodeProfiling'])
Exemplo n.º 12
0
    def test_esk105bc(self):
        """Run tutorials esk105 B and C: persist per-chain, then resume at chain 3."""
        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + '/tutorials/esk105_B_store_each_chain.py'

        status = execution.run_eskapade(settings)

        pm = ProcessManager()
        settings = pm.service(ConfigObject)
        ds = pm.service(DataStore)

        # results of all three chains have been persisted
        self.assertTrue(status.isSuccess())
        path = '{0:s}/{1:s}/proc_service_data/v0/_chain{{:d}}/{2:s}.pkl'.format(
            settings['resultsDir'], settings['analysisName'], str(DataStore))
        for chain_idx in (1, 2, 3):
            self.assertTrue(os.path.exists(path.format(chain_idx)))

        # restart from scratch, picking up the persisted state at chain 3
        execution.reset_eskapade()

        settings = ProcessManager().service(ConfigObject)
        settings['logLevel'] = definitions.LOG_LEVELS['DEBUG']
        settings['macro'] = settings['esRoot'] + '/tutorials/esk105_C_begin_at_chain3.py'

        status = execution.run_eskapade(settings)

        ds = ProcessManager().service(DataStore)

        # object from all three chains are present
        self.assertTrue(status.isSuccess())
        for key in ('f', 'g', 'h'):
            self.assertTrue(key in ds)
        self.assertEqual(3, len(ds.keys()))
        self.assertEqual(7, ds['f']['n_favorite'])
        self.assertEqual(1, ds['g']['a'])
        self.assertEqual(7, ds['h'][1])
Exemplo n.º 13
0
    def execute(self):
        """Execute link

        Fetches the input dataframe from the datastore and applies each
        function configured in ``self.apply_funcs``.  Every entry is a dict
        that must contain 'func' and may contain 'args' and 'kwargs'; the
        result is routed depending on which further keys are present:

        * 'groupby': apply via ``self.groupbyapply`` on the grouped frame
          (optionally forwarding 'groupbyColout' as a kwarg);
        * 'storekey': store the result under that datastore key, computed
          from the whole frame ('entire'), a single column ('colin'), or a
          row-wise apply otherwise;
        * otherwise 'colout' is required: write the result into that column
          of the dataframe, with the same 'entire'/'colin' variants.

        Finally, constant columns from ``self.add_columns`` are appended and
        the (possibly modified) frame is stored back into the datastore.
        """

        ds = ProcessManager().service(DataStore)
        assert self.read_key in list(ds.keys()), 'key <%s> not in DataStore.' % self.read_key
        df = ds[self.read_key]

        for arr in self.apply_funcs:
            # get func input; 'func' is the only mandatory key
            keys = list(arr.keys())
            assert 'func' in keys, 'function input is insufficient.'
            func = arr['func']
            self.log().debug('Applying function %s' % str(func))
            args = ()
            kwargs = {}
            if 'kwargs' in keys:
                kwargs = arr['kwargs']
            if 'args' in keys:
                args = arr['args']

            # apply func, dispatching on the routing keys described above
            if 'groupby' in keys:
                groupby = arr['groupby']
                if 'groupbyColout' in keys:
                    kwargs['groupbyColout'] = arr['groupbyColout']
                df = self.groupbyapply(df, groupby, func, *args, **kwargs)
            elif 'storekey' in keys:
                # result goes to its own datastore key, not into the frame
                if 'entire' in keys:
                    # func consumes the entire dataframe
                    result = func(df, *args, **kwargs)
                elif 'colin' in keys:
                    # func is applied element-wise to one input column
                    colin = arr['colin']
                    assert colin in df.columns
                    result = df[colin].apply(func, args=args, **kwargs)
                else:
                    # row/column-wise apply over the whole frame
                    result = df.apply(func, args=args, **kwargs)
                ds[arr['storekey']] = result
            else:
                # result becomes a new/overwritten column of the frame
                assert 'colout' in keys, 'function input is insufficient'
                colout = arr['colout']
                if 'entire' in keys:
                    df[colout] = func(df, *args, **kwargs)
                elif 'colin' in keys:
                    # 'colin' may be a single column name or a list of them
                    colin = arr['colin']
                    if isinstance(colin, list):
                        for c in colin:
                            assert c in df.columns
                    else:
                        assert colin in df.columns
                    df[colout] = df[colin].apply(func, args=args, **kwargs)
                else:
                    df[colout] = df.apply(func, args=args, **kwargs)

        # add columns with constant values
        if self.add_columns is not None:
            for k, v in self.add_columns.items():
                df[k] = v

        # store under store_key, or overwrite the input key if none given
        if self.store_key is None:
            ds[self.read_key] = df
        else:
            ds[self.store_key] = df

        return StatusCode.Success
Exemplo n.º 14
0
    def execute(self):
        """ Execute ApplySelectionToDf

        Applies queries or column selection to a pandas DataFrame.
        Input dataframe is not overwritten, unless told to do so in kwargs.

        1. Apply queries, in order of provided query list.
        2. Select columns (if provided).

        :returns: status code of execution
        :rtype: StatusCode
        """

        ds = ProcessManager().service(DataStore)
        assert self.readKey in list(
            ds.keys()), 'Key %s not in DataStore.' % self.readKey
        assert isinstance(
            ds[self.readKey], pd.DataFrame
        ), 'Object with key %s is not a pandas DataFrame.' % self.readKey

        # df stays None until a selection produces a frame to store
        df = None

        # 1. apply queries to input dataframe.
        #    input dataframe is not overwritten, unless told to do so in kwargs.
        if len(self.querySet):
            df = ds[self.readKey]
            for query in self.querySet:
                try:
                    df = df.query(query, **self.kwargs)
                except Exception:
                    # narrow from a bare except: do not swallow
                    # KeyboardInterrupt/SystemExit
                    if not self.continueIfFailure:
                        raise ValueError(
                            'Failed to apply query <%s> to dataframe <%s>.' %
                            (query, self.readKey))
                    # best-effort mode: fall back to an empty frame with the
                    # original columns and stop applying further queries
                    df = pd.DataFrame(columns=(ds[self.readKey]).columns)
                    break

        # 2. apply column selection to input dataframe.
        #    input dataframe is not overwritten.
        if len(self.selectColumns):
            if df is None:
                # shallow copy so the stored frame is a distinct object
                df = (ds[self.readKey]).copy(deep=False)
            try:
                df = df[self.selectColumns]
            except Exception:
                if not self.continueIfFailure:
                    raise ValueError(
                        'Failed to select columns <%s> of dataframe <%s>.' %
                        (str(self.selectColumns), self.readKey))
                # best-effort mode: empty frame with the requested columns
                df = pd.DataFrame(columns=self.selectColumns)

        assert df is not None, 'No dataframe available for storage?'

        # store the selection result and its length
        ds[self.storeKey] = df
        ds['n_' + self.storeKey] = len(df.index)

        self.log().info('Stored dataframe with key <%s> and length <%d>.' %
                        (self.storeKey, len(df.index)))

        return StatusCode.Success