def execute(self):
        """Execute MongoMoveCollection.

        Read the documents of ``self.source_collection`` (optionally selected
        with ``self.filter``), add the constant columns of
        ``self.columnsToAdd`` and insert the result into every collection of
        ``self.target_collections``.  Unless ``self.copy`` is set, the
        transferred documents are deleted from the source collection
        afterwards.

        ``self.filter`` may be ``None``, a query dict, or a string that is
        the datastore key of a query dict; in the last case the attribute is
        replaced by the dict found in the datastore.

        When an insert or the final delete fails, the documents that were
        already written to target collections are removed again.

        :returns: status code of execution
        :rtype: StatusCode
        :raises Exception: if the source collection does not exist, the
            filter has an unsupported type, or an insert/delete fails
        :raises AssertionError: if the filter key is not in the datastore or
            does not refer to a dict
        """
        process_manager.service(MongoConnection).set_config_info(process_manager.service(ConfigObject))
        self.mdb = process_manager.service(MongoConnection).database

        # the source collection has to exist before anything is transferred
        colls = self.mdb.collection_names()
        if self.source_collection not in colls:
            raise Exception("%s is not a collection in the mongo database" % self.source_collection)

        # a string filter is the datastore key of the actual query dict
        if self.filter is not None and not isinstance(self.filter, dict):
            if not isinstance(self.filter, str):
                raise Exception('Given filter of incorrect type.')
            # keep the key so that error messages report the key, not the value
            filter_key = self.filter
            ds = process_manager.service(DataStore)
            if filter_key not in ds:
                raise AssertionError('Filter key <%s> not found in datastore.' % filter_key)
            query = ds[filter_key]
            if not isinstance(query, dict):
                raise AssertionError('Filter with key <%s> is not a dict.' % filter_key)
            self.filter = query

        source = self.mdb[self.source_collection]
        data = source.find(self.filter) if self.filter is not None else source.find()
        df = pd.DataFrame(list(data))

        if len(df) == 0:
            self.logger.info('Source collection <{collection}> has zero length. Nothing to move.',
                             collection=self.source_collection)
            return StatusCode.Success

        if self.columnsToAdd is not None:
            for k, v in self.columnsToAdd.items():
                df[k] = v
        docs = list(df.T.to_dict().values())

        # query that matches exactly the documents being transferred
        id_query = {'_id': {"$in": list(df['_id'])}}
        appliedstr = 'Copied' if self.copy else 'Moved'

        filled = []
        for coll in self.target_collections:
            try:
                self.mdb[coll].insert_many(docs)
            except Exception as exc:
                # undo the inserts into the targets that were already filled
                for done in filled:
                    self.mdb[done].delete_many(id_query)
                raise Exception('Error in move: insertion in target collection %s failed' % coll) from exc
            filled.append(coll)
            self.logger.info('{action} collection <{collection}> with length <{length}> to <{target}>.',
                             action=appliedstr, collection=self.source_collection, length=len(docs), target=coll)

        if not self.copy:
            try:
                source.delete_many(id_query)
            except Exception as exc:
                # the move did not complete: remove the copies from all targets
                for done in filled:
                    self.mdb[done].delete_many(id_query)
                raise Exception(
                    'Error in move: deletion from source collection %s failed' % self.source_collection) from exc

        return StatusCode.Success
示例#2
0
    def test_esk108reduce(self):
        """Run the esk108 reduce tutorial macro and check ``n_products``."""
        # the TESTING setting has to be in place before the macro runs
        config = process_manager.service(ConfigObject)
        config['TESTING'] = True

        macro_path = resources.tutorial('esk108_reduce.py')
        self.eskapade_run(macro_path)

        datastore = process_manager.service(DataStore)
        self.assertEqual(20, datastore['n_products'])
示例#3
0
    def test_esk110(self):
        """Run the esk110 code-profiling tutorial macro and check the result.

        After the run no chains and no datastore entries are expected, and
        the ``doCodeProfiling`` setting must be ``'cumulative'``.
        """
        macro_path = resources.tutorial('esk110_code_profiling.py')
        self.eskapade_run(macro_path)

        # services are looked up after the run
        config = process_manager.service(ConfigObject)
        datastore = process_manager.service(DataStore)

        n_chains = len(process_manager)
        self.assertEqual(0, n_chains)
        self.assertEqual(0, len(datastore))
        self.assertTrue('doCodeProfiling' in config)
        self.assertEqual('cumulative', config['doCodeProfiling'])
示例#4
0
def eskapade_run():
    """Run Eskapade.

    Top-level entry point for an Eskapade run started from the
    command line.  Arguments specified by the user are parsed and
    converted to settings in the configuration object.  Optionally, an
    interactive Python session is started when the run is finished.

    Any exception raised by the run is logged and re-raised.  The
    interactive console is created from ``locals()``, so every local name
    of this function (``ds``, ``settings``, ``pd``, ...) is visible in the
    interactive session.
    """
    from escore import process_manager, ConfigObject, DataStore
    from escore.core import execution
    from escore.core.run_utils import create_arg_parser

    # create parser for command-line arguments
    parser = create_arg_parser()
    user_args = parser.parse_args()

    # create config object for settings
    if not user_args.unpickle_config:
        # create new config
        settings = ConfigObject()
    else:
        # read previously persisted settings if pickled file is specified
        # (the first config file is consumed here and removed from the list)
        conf_path = user_args.config_files.pop(0)
        settings = ConfigObject.import_from_file(conf_path)
    # drop the attribute before the arguments are handed to set_user_opts
    # NOTE(review): presumably so it does not end up as a setting - confirm
    del user_args.unpickle_config

    # set configuration macros
    settings.add_macros(user_args.config_files)

    # set user options
    settings.set_user_opts(user_args)

    try:
        # run Eskapade
        execution.eskapade_run(settings)
    except Exception as exc:
        # `logger` is not defined in this function; it has to come from the
        # enclosing module
        logger.error('{exc}', exc=exc)
        raise

    # start interpreter if requested (--interactive on command line)
    if settings.get('interactive'):
        # set Pandas display options
        import pandas as pd
        pd.set_option('display.width', 120)
        pd.set_option('display.max_columns', 50)

        # make datastore and config available in interactive session
        ds = process_manager.service(DataStore)
        settings = process_manager.service(ConfigObject)

        # start interactive session
        from code import InteractiveConsole
        # the local names of this function form the console namespace
        cons = InteractiveConsole(locals())
        cons.interact(
            "\nContinuing interactive session ... press Ctrl+d to exit.\n")
    def fork_and_store(self):
        """Fork and then store input collection.

        The child process opens its own mongo connection (the connection
        needs to be reopened after a fork), stores the documents found in
        the datastore under ``self.read_key`` into ``self.store_collections``
        and then terminates with ``os._exit``.  The child never returns to
        the caller: it exits with ``os.EX_OK`` on success and with status 1
        when the storage raised an exception.

        The parent process waits for the child only when
        ``self.wait_after_fork`` is set.

        :raises OSError: if the child process could not be created
        """
        self.logger.debug("Process id before forking: {}".format(os.getpid()))

        child_pid_list = []

        # submit a new process
        try:
            pid = os.fork()
        except OSError as exc:
            raise OSError("Could not create a child process.") from exc

        if pid == 0:
            self.logger.debug("In child process with PID {}".format(
                os.getpid()))

            exit_code = os.EX_OK
            mongo_connection = None
            try:
                # Need to open separate mongo connection after each fork
                settings = process_manager.service(ConfigObject)
                mongo_connection = MongoConnection()
                mongo_connection.set_config_info(settings)
                mdb = mongo_connection.database

                ds = process_manager.service(DataStore)
                docs = ds[self.read_key]

                # store docs
                etl_utils.dostorage(mdb, docs, self.store_collections,
                                    self.clearFirst, self.logger,
                                    self.read_key)
            except Exception as exc:
                # report the failure through the exit status; the child must
                # not fall through into the code of the parent process
                self.logger.error(
                    'Storage in child process {pid} failed: {exc}',
                    pid=os.getpid(), exc=exc)
                exit_code = 1
            finally:
                try:
                    # close connection
                    if mongo_connection is not None:
                        mongo_connection.close()
                finally:
                    # safe jupyter exit when forking
                    os._exit(exit_code)

        # only the parent process gets here
        self.logger.debug(
            "Back in parent process after forking child {}".format(pid))
        child_pid_list.append(pid)

        # can wait for fork to finish, or just go.
        if self.wait_after_fork:
            # check that fork is finished
            while child_pid_list:
                self.logger.debug("Waiting for child process to finish.")
                finished = os.waitpid(0, 0)
                if finished[0] in child_pid_list:
                    self.logger.debug(
                        "Finished child process {} with status {}".format(
                            finished[0], finished[1]))
                    child_pid_list.remove(finished[0])

        self.logger.debug('Finished fork.')
示例#6
0
    def initialize(self):
        """Initialize SkipChainIfCollectionEmpty.

        Configure the mongo connection service and keep a handle to its
        database.  When ``self.checkAtInitialize`` is set, the result of
        ``self.checkCollectionSet()`` is returned instead of a plain success.

        :returns: status code of initialization
        """
        connection = process_manager.service(MongoConnection)
        connection.set_config_info(process_manager.service(ConfigObject))
        self.mdb = connection.database

        # run the collection check right away only when asked to
        if not self.checkAtInitialize:
            return StatusCode.Success
        return self.checkCollectionSet()
    def execute(self):
        """Execute MongoCheckCollection.

        Configure the mongo connection service and verify that every name in
        ``self.collectionSet`` is an existing collection of the database.

        :returns: status code of execution
        :raises Exception: for the first collection name that is missing
        """
        connection = process_manager.service(MongoConnection)
        connection.set_config_info(process_manager.service(ConfigObject))
        self.mdb = connection.database

        # compare the requested names with the collections in the database
        existing = self.mdb.collection_names()
        for name in self.collectionSet:
            if name in existing:
                continue
            raise Exception("%s is not a collection in the mongo database" % name)

        return StatusCode.Success
示例#8
0
    def initialize(self):
        """Initialize MongoDeleteManyFromDF.

        Configure the mongo connection service and check that a collection
        named ``self.read_key`` exists in the database.

        :returns: status code of initialization
        :raises Exception: if ``self.read_key`` is not an existing collection
        """
        connection = process_manager.service(MongoConnection)
        connection.set_config_info(process_manager.service(ConfigObject))
        self.mdb = connection.database

        existing = self.mdb.collection_names()
        if self.read_key in existing:
            return StatusCode.Success

        raise Exception("%s is not a collection in the mongo database" %
                        self.read_key)
    def initialize(self):
        """Initialize MongoDFToCollection.

        Configure the mongo connection service, keep a handle to its
        database and validate the read key.  When no store collections are
        given, the read key is used as the name of the single collection to
        store into.

        :returns: status code of initialization
        :raises AssertionError: if ``self.read_key`` is not a non-empty string
        """
        process_manager.service(MongoConnection).set_config_info(
            process_manager.service(ConfigObject))
        self.mdb = process_manager.service(MongoConnection).database

        # explicit raise instead of `assert`, which is stripped under -O
        if not (isinstance(self.read_key, str) and len(self.read_key) > 0):
            raise AssertionError('read key not set.')

        # default: store into a collection named after the read key
        if len(self.store_collections) == 0:
            self.store_collections.append(self.read_key)
        return StatusCode.Success
示例#10
0
    def test_esk106(self):
        """Run the esk106 command-line-options tutorial with chain 0 disabled."""
        # fake a setting from the cmd-line. picked up in the macro
        process_manager.service(ConfigObject)['do_chain0'] = False

        macro_path = resources.tutorial('esk106_cmdline_options.py')
        self.eskapade_run(macro_path)

        # look the settings up again after the run
        config = process_manager.service(ConfigObject)

        self.assertEqual(1, len(process_manager))
        chains = list(process_manager)
        self.assertEqual('Chain1', chains[0].name)
        self.assertEqual(False, config.get('do_chain0', True))
        self.assertEqual(True, config.get('do_chain1', True))
        first_link = list(chains[0])[0]
        self.assertEqual('Universe', first_link.hello)
示例#11
0
    def initialize(self):
        """Initialize the link.

        Configure the mongo connection service with the current settings,
        keep a handle to its database and check the ``collection`` and
        ``store_key`` arguments.

        :returns: status code of initialization
        :rtype: StatusCode
        """
        config = process_manager.service(ConfigObject)
        connection = process_manager.service(MongoConnection)
        connection.set_config_info(config)
        self.mdb = connection.database

        # both arguments have to be strings and pass the value check
        self.check_arg_types(collection=str, store_key=str)
        self.check_arg_vals('collection', 'store_key')

        return StatusCode.Success
示例#12
0
    def execute(self):
        """Execute MongoDeleteManyFromDF.

        Take the dataframe stored in the datastore under ``self.read_key``
        and delete from the mongo collection with the same name every
        document whose ``_id`` occurs in the ``_id`` column of the dataframe.

        :returns: status code of execution
        :raises Exception: if the dataframe has no ``_id`` column, the column
            fails the dtype check, or the deletion fails
        """
        import bson

        ds = process_manager.service(DataStore)
        df = ds[self.read_key]

        if '_id' not in df.columns:
            raise Exception('No _id column in dataframe %s' % self.read_key)
        # NOTE(review): this compares the column dtype with a Python class;
        # presumably it only verifies that the column has object dtype, not
        # that each value is an ObjectId - confirm.
        if df['_id'].dtype != bson.objectid.ObjectId:
            raise Exception(
                '_id column not of the correct type. Type should be bson.objectId.ObjectId'
            )

        try:
            self.mdb[self.read_key].delete_many(
                {'_id': {"$in": list(df['_id'])}})
        except Exception as exc:
            # do not repeat the failed call; report the collection that was
            # actually used and keep the original error as the cause
            raise Exception(
                'Deletion in collection %s failed' % self.read_key) from exc

        return StatusCode.Success
    def execute(self):
        """Execute the link.

        Fetch the object stored under ``self.read_key`` from the datastore
        (it has to be a non-empty dict or list) and hand it to
        ``etl_utils.dostorage`` together with the store collections.

        :returns: status code of execution
        :rtype: StatusCode
        """
        # the configuration service is requested here as well, although its
        # settings are not used below
        process_manager.service(ConfigObject)
        datastore = process_manager.service(DataStore)

        documents = datastore.get(self.read_key,
                                  assert_type=(dict, list),
                                  assert_len=True)

        self.logger.debug('Started doing storage')
        etl_utils.dostorage(self.mdb,
                            documents,
                            self.store_collections,
                            self.clear_first,
                            self.logger,
                            self.read_key)

        return StatusCode.Success
    def initialize(self):
        """Initialize the link.

        Check the entries of ``self.keys``: a string entry is replaced in
        place by a dict that uses the string as both ``key_fs`` and
        ``key_ds`` with ``unit_func`` as function; every entry must end up as
        a dict containing ``key_ds``.  The execution counter of this link in
        the fork store is reset to zero.

        :returns: status code of initialization
        :rtype: StatusCode
        :raises AssertionError: if an entry is neither str nor dict, or lacks
            ``key_ds``
        """
        if not self.keys:
            self.logger.warning('No functions to apply')

        # perform basic checks on input keys
        for position, entry in enumerate(self.keys):
            if isinstance(entry, str):
                # expand the short-hand string into the full dict form
                entry = dict(key_fs=entry, key_ds=entry, func=unit_func)
                self.keys[position] = entry
            if not isinstance(entry, dict):
                raise AssertionError(
                    'keys attribute is not a list of dict/str.')
            if 'key_ds' not in entry:
                raise AssertionError('key input is insufficient.')

        # will count number of times execute has been called.
        fork_store = process_manager.service(ForkStore)
        counter_key = 'n_' + self.name + '_executed'
        fork_store[counter_key] = 0

        return StatusCode.Success
示例#15
0
    def test_esk101(self):
        """Run the esk101 hello-world tutorial macro and check its settings."""
        macro_path = resources.tutorial('esk101_helloworld.py')
        self.eskapade_run(macro_path)

        config = process_manager.service(ConfigObject)

        do_hello = config['do_hello']
        self.assertTrue(do_hello)
        self.assertEqual(2, config['n_repeat'])
示例#16
0
    def execute(self):
        """Execute the link.

        Apply ``self.func`` and optionally store the result.  When
        ``self.read_key`` is a string, the matching datastore object is
        passed to the function as first argument; when it is a list of keys,
        a list with the matching objects is passed.  Without a read key the
        function is called with ``self.args`` and ``self.kwargs`` only.  The
        result is written to the datastore when ``self.store_key`` is set.

        :returns: status code of execution
        :rtype: StatusCode
        :raises TypeError: if ``self.read_key`` is set but is neither a
            string nor a list
        """
        ds = process_manager.service(DataStore)

        if self.read_key:
            # fetch and check input
            if isinstance(self.read_key, str):
                obj = ds.get(self.read_key, self.default, self.assert_type,
                             self.assert_len, self.assert_in)
            elif isinstance(self.read_key, list):
                obj = [
                    ds.get(key, self.default, self.assert_type,
                           self.assert_len, self.assert_in)
                    for key in self.read_key
                ]
            else:
                # fail with a clear message instead of an unbound local below
                raise TypeError(
                    'read_key should be a string or a list of strings, '
                    'got {}.'.format(type(self.read_key).__name__))
            # apply function
            trans_obj = self.func(obj, *self.args, **self.kwargs)
        else:
            # possibly function does not require object as input
            trans_obj = self.func(*self.args, **self.kwargs)

        if self.store_key:
            ds[self.store_key] = trans_obj

        return StatusCode.Success
示例#17
0
    def test_esk105a(self):
        """Run the esk105 A tutorial macro and check no results path exists."""
        macro_path = resources.tutorial('esk105_A_dont_store_results.py')
        self.eskapade_run(macro_path)

        config = process_manager.service(ConfigObject)
        # <resultsDir>/<analysisName> must not have been created
        results_path = config['resultsDir'] + '/' + config['analysisName']
        path_exists = os.path.exists(results_path)

        self.assertFalse(path_exists)
示例#18
0
def mongo_reset_collections():
    """Drop all MongoDB collections except ``proxy`` and ``roles``.

    Command-line entry point.  The MongoDB configuration file is taken from
    ``--config``/``-c`` when that path exists, and from the packaged
    ``mongo.cfg`` otherwise.  The chosen file is stored in the ``mongodb``
    setting before the connection is configured.
    """
    import argparse

    from escore import process_manager, ConfigObject
    from escore.core import persistence
    from mongodbtools import resources
    from mongodbtools import MongoConnection

    parser = argparse.ArgumentParser(
        'eskapade_mongo_reset_collections',
        description='Clean MongoDB collections.',
        epilog=
        'Please note, only the collections \'proxy\' and \'roles\' will be kept.'
    )
    parser.add_argument('--config',
                        '-c',
                        nargs='?',
                        help='Path to custom MongoDB configuration file.')
    args = parser.parse_args()

    # fall back to the packaged configuration when no usable path is given
    path = resources.config('mongo.cfg')
    if args.config:
        if os.path.exists(args.config):
            path = args.config
        else:
            logger.warning(
                'MongoDB configuration {config} not found, using default.'.
                format(config=args.config))
    logger.info('Using MongoDB configuration from {path:s}'.format(path=path))

    settings = process_manager.service(ConfigObject)
    settings['analysisName'] = 'mongo_reset_collections'
    settings['version'] = 0

    # use the configuration selected above, not unconditionally the default
    settings['mongodb'] = path

    process_manager.service(MongoConnection).set_config_info(settings)
    mdb = process_manager.service(MongoConnection).database

    # --- these collections you should keep (authorization data/proxy service)
    col_keep = ['proxy', 'roles']

    # NOTE(review): collection_names() is deprecated in recent PyMongo
    # releases in favour of list_collection_names() - confirm against the
    # PyMongo version this package pins.
    col_names = mdb.collection_names(False)
    for name in col_names:
        if name not in col_keep:
            # dropping the collection also removes all of its documents
            mdb.drop_collection(name)

    logger.info('Cleared all MongoDB collections (except {cols})'.format(
        cols=col_keep))
示例#19
0
    def configure_mongo(self, lock: bool = False) -> None:
        """Configure mongo used during execute.

        This is the final part of initialization, and needs to be redone in
        case of forked processing. Hence this function is split off into a
        separate function. The function can be locked once the configuration
        is final; a locked configuration makes further calls return
        immediately.

        When the ``fork`` setting is true, a separate mongo connection is
        opened and ``self.skip``/``self.limit`` are derived from the
        ``fork_index`` setting; otherwise the shared connection service is
        used.  A cursor reader on the source collection is stored in
        ``self._reader``.

        :param bool lock: if True, lock this part of the configuration
        :raises NameError: if the source collection does not exist
        :raises BufferError: if no cursor to the source collection could be
            obtained
        """
        if self.config_lock:
            return
        self.config_lock = lock

        settings = process_manager.service(ConfigObject)
        if settings.get('fork', False):  # during fork
            # Need to open separate mongo connection after each fork
            self.mongo_connection = MongoConnection()
            self.mongo_connection.set_config_info(settings)
            self.mdb = self.mongo_connection.database
            # set length of cursor of this fork
            fidx = settings['fork_index']
            self.skip = self.n_chunks_in_fork * fidx * self.chunk_size
            self.limit = self.n_chunks_in_fork * self.chunk_size
        else:  # default (no fork)
            process_manager.service(MongoConnection).set_config_info(settings)
            self.mdb = process_manager.service(MongoConnection).database

        # check if collection names are in database
        colls = self.mdb.collection_names()
        if self.collection not in colls:
            message = ('Source collection <%s> does not exist in mongo db.' %
                       self.collection)
            self.logger.warning(message)
            raise NameError(message)

        try:
            kwargs = dict()
            kwargs['filter'] = self.query
            kwargs['projection'] = self.use_cols
            # skip and limit are only passed on when they are set
            if self.skip:
                kwargs['skip'] = self.skip
            if self.limit:
                kwargs['limit'] = self.limit
            cursor = self.mdb[self.collection].find(**kwargs)
            self._reader = MongoCursorReader(cursor, self.chunk_size)
        except Exception as exc:
            message = (
                'Could not get cursor to source collection <%s> from mongo db.'
                % self.collection)
            self.logger.critical(message)
            # keep the underlying error as the cause of the BufferError
            raise BufferError(message) from exc
示例#20
0
    def test_esk104(self):
        """Run the esk104 datastore-operations tutorial and check the store."""
        macro_path = resources.tutorial('esk104_basic_datastore_operations.py')
        self.eskapade_run(macro_path)

        datastore = process_manager.service(DataStore)

        # exactly one entry is expected: 'a' with value 1
        n_entries = len(datastore)
        self.assertEqual(1, n_entries)
        self.assertEqual(1, datastore['a'])
示例#21
0
    def test_esk107(self):
        """Run the esk107 chain-looper tutorial and check the loop count."""
        macro_path = resources.tutorial('esk107_chain_looper.py')
        self.eskapade_run(macro_path)

        datastore = process_manager.service(DataStore)

        # chain is repeated 10 times, with nothing put in datastore
        self.assertEqual(0, len(datastore))
        first_chain = list(process_manager)[0]
        second_link = list(first_chain)[1]
        self.assertEqual(10, second_link.maxcount)
    def finalize(self):
        """Finalize the link.

        Move the objects collected in the fork store back into the
        datastore.  For every entry of ``self.keys`` the object stored under
        ``key_fs`` (default: ``key_ds``) is passed through ``func`` (default:
        ``unit_func``) with the optional ``args``/``kwargs`` and the result
        is written to the datastore under ``key_ds``.  Nothing is done when
        the execution counter of this link is zero.

        :returns: status code of finalization
        :rtype: StatusCode
        :raises AssertionError: if a key is missing from the fork store
        :raises Exception: if applying the function to an object fails
        """
        ds = process_manager.service(DataStore)
        fs = process_manager.service(ForkStore)

        counter_key = 'n_' + self.name + '_executed'

        # check if nothing to do
        if fs.get(counter_key, 0) == 0:
            return StatusCode.Success
        # check number of times forkdatacollector has run
        if fs.get('n_fork', 0) > 0 and (fs[counter_key] % fs['n_fork'] > 0):
            self.logger.warning(
                'Did not execute multiple of n_fork {0} times: {1}. Data may be missing.'
                .format(fs['n_fork'], fs[counter_key]))

        # putting (transformed) objects from forkstore back into datastore
        for arr in self.keys:
            key_ds = arr['key_ds']
            key_fs = arr.get('key_fs', key_ds)
            if key_fs not in fs:
                raise AssertionError('key {} not in forkstore.'.format(key_fs))
            # retrieve function to apply
            func = arr.get('func', unit_func)
            args = arr.get('args', ())
            kwargs = arr.get('kwargs', {})
            # apply transformation
            self.logger.debug('Applying function {function!s}.', function=func)
            obj = fs[key_fs]
            try:
                trans_obj = func(obj, *args, **kwargs)
            except Exception as exc:
                # Exception() takes no keyword arguments, so the message is
                # formatted here; the original error is kept as the cause
                raise Exception(
                    'Failed to apply function {function!s} to object with {key}.'
                    .format(function=func, key=key_ds)) from exc
            # put transformed object back in datastore
            ds[key_ds] = trans_obj

        fs.Print()

        return StatusCode.Success
    def initialize(self):
        """Initialize the link.

        Check the types and values of the link arguments, then configure the
        mongo connection service with the current settings and keep a handle
        to its database.

        :returns: status code of initialization
        :rtype: StatusCode
        """
        # read_key is a string, store_collections a list ...
        self.check_arg_types(read_key=str, store_collections=list)
        # ... whose elements are checked against str
        self.check_arg_types(recurse=True,
                             allow_none=True,
                             store_collections=str)
        self.check_arg_vals('read_key')

        config = process_manager.service(ConfigObject)
        connection = process_manager.service(MongoConnection)
        connection.set_config_info(config)
        self.mdb = connection.database

        return StatusCode.Success
示例#24
0
    def setUp(self):
        """Reset Eskapade and apply the settings shared by all tests."""
        execution.reset_eskapade()

        config = process_manager.service(ConfigObject)
        # the analysis is named after the test class
        test_options = (
            ('analysisName', self.__class__.__name__),
            ('logLevel', LogLevel.DEBUG),
            ('batchMode', True),
        )
        for option, value in test_options:
            config[option] = value
示例#25
0
    def execute(self):
        """Execute the link.

        Check that every key in ``self.keySet`` is present in the datastore.

        :returns: status code of execution
        :raises AssertionError: for the first key that is not in the datastore
        """
        ds = process_manager.service(DataStore)

        for key in self.keySet:
            # explicit raise: an `assert` statement is stripped under -O,
            # which would turn this check into a no-op
            if key not in ds:
                raise AssertionError('Key {} not in DataStore.'.format(key))

        return StatusCode.Success
示例#26
0
    def test_esk102(self):
        """Run the esk102 multiple-chains tutorial and check the chains."""
        macro_path = resources.tutorial('esk102_multiple_chains.py')
        self.eskapade_run(macro_path)

        config = process_manager.service(ConfigObject)

        # all three chain switches are expected to be on
        for switch in ('do_chain0', 'do_chain1', 'do_chain2'):
            self.assertTrue(config[switch])
        self.assertEqual(3, len(process_manager))
示例#27
0
    def test_esk106_script(self, mock_argv):
        """Test Eskapade run with esk106 macro from script

        The command-line entry point is run three times with different
        arguments; after each run the chain names and the resulting settings
        are compared with the expected values.

        :param mock_argv: mock object standing in for the argument list
            (presumably injected by a patch decorator outside this view -
            verify against the decorator)
        """

        # get file paths
        settings = process_manager.service(ConfigObject)
        settings['analysisName'] = 'esk106_cmdline_options'
        settings_ = settings.copy()
        macro_path = resources.tutorial('esk106_cmdline_options.py')

        # mock command-line arguments
        # item access on the mock is forwarded to `args`, which do_run
        # refills in place before every run
        args = []
        mock_argv.__getitem__ = lambda s, k: args.__getitem__(k)

        # base settings
        args_ = [macro_path, '-LDEBUG', '--batch-mode']
        settings_['macro'] = macro_path
        settings_['logLevel'] = LogLevel.DEBUG
        settings_['batchMode'] = True

        def do_run(name, args, args_, settings_, add_args, add_settings,
                   chains):
            # name: label used in failure messages; args: shared list read
            # through the mock; args_/settings_: base arguments and settings;
            # add_args/add_settings: additions for this run; chains: expected
            # chain names
            # set arguments
            # (mutated in place so that the mock keeps seeing the same list)
            args.clear()
            args += args_ + add_args
            settings = settings_.copy()
            settings.update(add_settings)

            # run Eskapade
            process_manager.reset()
            entry_points.eskapade_run()
            settings_run = process_manager.service(ConfigObject)

            # check results
            self.assertListEqual(
                [c.name for c in process_manager.chains], chains,
                'unexpected chain names in "{}" test'.format(name))
            self.assertDictEqual(
                settings_run, settings,
                'unexpected settings in "{}" test'.format(name))

        # run both chains
        do_run(
            'both chains', args, args_, settings_,
            ['--store-all', '-cdo_chain0=True', '-cdo_chain1=True'],
            dict(storeResultsEachChain=True, do_chain0=True,
                 do_chain1=True), ['Chain0', 'Chain1'])

        # run only last chain by skipping the first
        # (both chain names are still expected in process_manager.chains)
        do_run('skip first', args, args_, settings_,
               ['-bChain1', '-cdo_chain0=True', '-cdo_chain1=True'],
               dict(beginWithChain='Chain1', do_chain0=True,
                    do_chain1=True), ['Chain0', 'Chain1'])

        # run only last chain by not defining the first
        do_run('no first', args, args_, settings_,
               ['-cdo_chain0=False', '-cdo_chain1=True'],
               dict(do_chain0=False, do_chain1=True), ['Chain1'])
示例#28
0
    def test_esk109(self):
        """Run the esk109 debugging-tips tutorial, which is expected to fail."""
        # this flag turns off python embed link
        process_manager.service(ConfigObject)['TESTING'] = True

        macro_path = resources.tutorial('esk109_debugging_tips.py')
        self.eskapade_run(macro_path, StatusCode.Failure)

        # the fourth link of the first chain has to be a Break link
        first_chain = list(process_manager)[0]
        fourth_link = list(first_chain)[3]
        self.assertTrue(isinstance(fourth_link, Break))
示例#29
0
    def import_and_update_datastore(self):
        """Import a datastore from file and merge or install it.

        The object read from ``self.path`` has to be a ``DataStore``.  With
        ``self.update`` set, the current datastore is updated with its
        content; otherwise the datastore service is replaced by it.

        :raises AssertionError: if the imported object is not a DataStore
        """
        # loading external datastore
        imported = DataStore.import_from_file(self.path)
        if not isinstance(imported, DataStore):
            self.logger.fatal('Object in file "{path}" not of type DataStore.',
                              path=self.path)
            raise AssertionError('Input object not of type DataStore.')

        if not self.update:
            # default: replace existing datastore
            process_manager.remove_service(DataStore)
            process_manager.service(imported)
            return

        # update existing datastore
        current = process_manager.service(DataStore)
        current.update(imported)
示例#30
0
    def test_esk103(self):
        """Run the esk103 print-datastore tutorial and check the store."""
        macro_path = resources.tutorial('esk103_printdatastore.py')
        self.eskapade_run(macro_path)

        datastore = process_manager.service(DataStore)

        self.assertEqual('world', datastore['hello'])
        # nested dict 'd' holds a=1, b=2, c=3
        for expected, letter in ((1, 'a'), (2, 'b'), (3, 'c')):
            self.assertEqual(expected, datastore['d'][letter])