Example #1
def test_tmgr_rp_tmgr():

    os.environ['RADICAL_PILOT_DBURL'] = MLAB
    os.environ['ENTK_HB_INTERVAL'] = '30'

    res_dict = {
        'resource': 'local.localhost',
        'walltime': 40,
        'cpus': 20,
    }
    config = {"sandbox_cleanup": False, "db_cleanup": False}
    rmgr_id = ru.generate_id('test.%(item_counter)04d', ru.ID_CUSTOM)
    rmgr = RPRmgr(resource_desc=res_dict, sid=rmgr_id, rts_config=config)
    rmgr._validate_resource_desc()
    rmgr._populate()
    rmgr._submit_resource_request()

    tmgr = RPTmgr(sid=rmgr_id,
                  pending_queue=['pendingq-1'],
                  completed_queue=['completedq-1'],
                  rmgr=rmgr,
                  mq_hostname=hostname,
                  port=port)

    tmgr.start_manager()

    proc = Process(target=func_for_mock_tmgr_test,
                   args=(hostname, port, tmgr._pending_queue[0],
                         tmgr._completed_queue[0]))
    proc.start()

    proc.join()
    tmgr.terminate_manager()
    rmgr._terminate_resource_request()
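The test above calls a helper, func_for_mock_tmgr_test, which is not shown on this page. Below is a minimal sketch of what such a mock worker might look like; the queue-draining behavior and the pika usage are assumptions, not the actual EnTK test utility.

import pika

def func_for_mock_tmgr_test(hostname, port, pending_queue, completed_queue):
    # Hypothetical stand-in: drain the pending queue and republish every
    # message on the completed queue, mimicking a task manager round trip.
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host=hostname, port=port))
    channel = connection.channel()

    while True:
        method, _props, body = channel.basic_get(queue=pending_queue)
        if method is None:
            break  # pending queue drained
        channel.basic_publish(exchange='', routing_key=completed_queue,
                              body=body)
        channel.basic_ack(delivery_tag=method.delivery_tag)

    connection.close()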
Example #2
def test_tmgr_rp_tmgr():

    res_dict = {'resource': 'local.localhost', 'walltime': 40, 'cpus': 20}
    config = {"sandbox_cleanup": False, "db_cleanup": False}
    rmgr_id = ru.generate_id('test', ru.ID_UNIQUE)
    rmgr = RPRmgr(resource_desc=res_dict, sid=rmgr_id, rts_config=config)
    rmq_conn_params = pika.ConnectionParameters(host=hostname, port=port)

    rmgr._validate_resource_desc()
    rmgr._populate()
    rmgr._submit_resource_request()

    tmgr = RPTmgr(sid=rmgr_id,
                  pending_queue=['pendingq-1'],
                  completed_queue=['completedq-1'],
                  rmgr=rmgr,
                  rmq_conn_params=rmq_conn_params)

    tmgr.start_manager()

    proc = mp.Process(target=func_for_mock_tmgr_test,
                      args=(hostname, port, tmgr._pending_queue[0],
                            tmgr._completed_queue[0]))
    proc.start()
    proc.join()

    tmgr.terminate_manager()
    rmgr._terminate_resource_request()
Example #3
def test_tmgr_rp_tmgr():

    os.environ['RADICAL_PILOT_DBURL'] = MLAB
    os.environ['ENTK_HB_INTERVAL'] = '30'

    res_dict = {
        'resource': 'local.localhost',
        'walltime': 40,
        'cpus': 20,
    }
    config = {"sandbox_cleanup": False, "db_cleanup": False}
    rmgr_id = ru.generate_id('test.%(item_counter)04d', ru.ID_CUSTOM)
    rmgr = RPRmgr(resource_desc=res_dict, sid=rmgr_id, rts_config=config)
    rmgr._validate_resource_desc()
    rmgr._populate()
    rmgr._submit_resource_request()

    tmgr = RPTmgr(sid=rmgr_id,
                  pending_queue=['pendingq-1'],
                  completed_queue=['completedq-1'],
                  rmgr=rmgr,
                  mq_hostname=hostname,
                  port=port)

    tmgr.start_manager()

    proc = Process(target=func_for_mock_tmgr_test,
                   args=(hostname, port, tmgr._pending_queue[0],
                         tmgr._completed_queue[0]))
    proc.start()

    proc.join()
    tmgr.terminate_manager()
    rmgr._terminate_resource_request()
Example #4
def test_write_session_description():

    amgr = AppManager(hostname=hostname, port=port)
    amgr.resource_desc = {'resource' : 'xsede.stampede',
                          'walltime' : 59,
                          'cpus'     : 128,
                          'gpus'     : 64,
                          'project'  : 'xyz',
                          'queue'    : 'high'}

    workflow      = [generate_pipeline(1), generate_pipeline(2)]
    amgr.workflow = workflow

    amgr._wfp = WFprocessor(sid=amgr.sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            resubmit_failed=amgr._resubmit_failed,
                            rmq_conn_params=amgr._rmq_conn_params)
    amgr._workflow = amgr._wfp.workflow

    amgr._task_manager = TaskManager(sid=amgr._sid,
                                     pending_queue=amgr._pending_queue,
                                     completed_queue=amgr._completed_queue,
                                     rmgr=amgr._rmgr,
                                     rmq_conn_params=amgr._rmq_conn_params
                                     )

    write_session_description(amgr)

    desc = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))
    src  = '%s/sample_data' % pwd

    assert desc == ru.read_json('%s/expected_desc_write_session.json' % src)
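The generate_pipeline helper used by this test is not defined on this page either. A plausible sketch using the public EnTK API follows; the task executable, arguments and naming scheme are illustrative assumptions only.

from radical.entk import Pipeline, Stage, Task

def generate_pipeline(i):
    # Hypothetical: one pipeline with a single one-task stage, enough to
    # exercise write_session_description.
    t = Task()
    t.name = 'task.%04d' % i
    t.executable = '/bin/echo'
    t.arguments = ['hello']

    s = Stage()
    s.name = 'stage.%04d' % i
    s.add_tasks(t)

    p = Pipeline()
    p.name = 'pipeline.%04d' % i
    p.add_stages(s)
    return p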
Example #5
def test_amgr_resource_terminate():

    res_dict = {
        'resource': 'xsede.supermic',
        'walltime': 30,
        'cpus': 20,
        'project': 'TG-MCB090174'
    }

    from radical.entk.execman.rp import TaskManager

    amgr = Amgr(rts='radical.pilot', hostname=hostname, port=port)
    amgr.resource_desc = res_dict
    amgr._setup_mqs()
    amgr._rmq_cleanup = True
    amgr._task_manager = TaskManager(sid='test',
                                     pending_queue=list(),
                                     completed_queue=list(),
                                     mq_hostname=amgr._mq_hostname,
                                     rmgr=amgr._resource_manager,
                                     port=amgr._port
                                     )

    amgr.resource_terminate()
Example #6
def test_write_session_description():

    amgr = AppManager(hostname=hostname,
                      port=port,
                      username=username,
                      password=password)
    amgr.resource_desc = {
        'resource': 'xsede.stampede',
        'walltime': 59,
        'cpus': 128,
        'gpus': 64,
        'project': 'xyz',
        'queue': 'high'
    }

    workflow = [generate_pipeline(1), generate_pipeline(2)]
    amgr.workflow = workflow

    amgr._wfp = WFprocessor(sid=amgr.sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            resubmit_failed=amgr._resubmit_failed,
                            rmq_conn_params=amgr._rmq_conn_params)
    amgr._workflow = amgr._wfp.workflow

    amgr._task_manager = TaskManager(sid=amgr._sid,
                                     pending_queue=amgr._pending_queue,
                                     completed_queue=amgr._completed_queue,
                                     rmgr=amgr._rmgr,
                                     rmq_conn_params=amgr._rmq_conn_params)

    write_session_description(amgr)

    desc = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))
    # tasks are originally a set but are saved as a list in JSON; sorting
    # the children makes the comparison deterministic without changing
    # validity
    for k, v in desc['tree'].items():
        if k.startswith("stage"):
            desc['tree'][k]['children'] = sorted(v['children'])

    src = '%s/sample_data' % pwd

    assert desc == ru.read_json('%s/expected_desc_write_session.json' % src)
Example #7
def test_write_session_description():

    hostname = os.environ.get('RMQ_HOSTNAME', 'localhost')
    port = int(os.environ.get('RMQ_PORT', 5672))
    amgr = AppManager(hostname=hostname, port=port)
    amgr.resource_desc = {
        'resource': 'xsede.stampede',
        'walltime': 60,
        'cpus': 128,
        'gpus': 64,
        'project': 'xyz',
        'queue': 'high'
    }

    workflow = [generate_pipeline(1), generate_pipeline(2)]
    amgr.workflow = workflow

    amgr._wfp = WFprocessor(sid=amgr._sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            mq_hostname=amgr._mq_hostname,
                            port=amgr._port,
                            resubmit_failed=amgr._resubmit_failed)
    amgr._wfp._initialize_workflow()
    amgr._workflow = amgr._wfp.workflow

    amgr._task_manager = TaskManager(sid=amgr._sid,
                                     pending_queue=amgr._pending_queue,
                                     completed_queue=amgr._completed_queue,
                                     mq_hostname=amgr._mq_hostname,
                                     rmgr=amgr._resource_manager,
                                     port=amgr._port
                                     )

    write_session_description(amgr)

    desc = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))
    curdir = os.path.dirname(os.path.abspath(__file__))
    src = '%s/sample_data' % curdir
    assert desc == ru.read_json('%s/expected_desc_write_session.json' % src)
Example #8
    def run(self):
        """
        **Purpose**: Run the application manager. Once the workflow and resource manager have been
        assigned, invoking this method sets up the communication infrastructure, submits a resource
        request and then submits all the tasks.
        """

        try:

            # Reset objects local to each run
            self._wfp = None
            self._sync_thread = None
            self._terminate_sync = Event()
            self._resubmit_failed = False
            self._cur_attempt = 1

            if not self._workflow:
                self._logger.error('No workflow assigned currently, please check your script')
                raise MissingError(obj=self._uid, missing_attribute='workflow')

            if not self._resource_manager:
                self._logger.error('No resource manager assigned currently, please create and add a valid resource manager')
                raise MissingError(obj=self._uid, missing_attribute='resource_manager')

            self._prof.prof('amgr run started', uid=self._uid)

            # Set up RabbitMQ
            if not self._mqs_setup:

                self._report.info('Setting up RabbitMQ system')
                setup = self._setup_mqs()

                if not setup:
                    self._logger.error('RabbitMQ system not available')
                    raise EnTKError("RabbitMQ setup failed")

                self._mqs_setup = True

                self._report.ok('>>ok\n')

            # Create WFProcessor object
            self._prof.prof('creating wfp obj', uid=self._uid)
            self._wfp = WFprocessor(sid=self._sid,
                                    workflow=self._workflow,
                                    pending_queue=self._pending_queue,
                                    completed_queue=self._completed_queue,
                                    mq_hostname=self._mq_hostname,
                                    port=self._port,
                                    resubmit_failed=self._resubmit_failed)
            self._wfp._initialize_workflow()
            self._workflow = self._wfp.workflow


            # Submit a resource request if no allocation has been made yet, or
            # resubmit if the previous one has completed
            if self._resource_manager:
                res_alloc_state = self._resource_manager.get_resource_allocation_state()
                if (not res_alloc_state) or (res_alloc_state in self._resource_manager.get_completed_states()):

                    self._logger.info('Starting resource request submission')
                    self._prof.prof('init rreq submission', uid=self._uid)
                    self._resource_manager._submit_resource_request()
                    res_alloc_state = self._resource_manager.get_resource_allocation_state()
                    if res_alloc_state in self._resource_manager.get_completed_states():
                        raise EnTKError(msg="Cannot proceed. Resource allocation ended up in %s"%res_alloc_state)

            else:

                self._logger.exception('Cannot run without resource manager, please create and assign a resource manager')
                raise EnTKError(msg='Missing resource manager')

            # Start synchronizer thread
            if not self._sync_thread:
                self._logger.info('Starting synchronizer thread')
                self._sync_thread = Thread(target=self._synchronizer, name='synchronizer-thread')
                self._prof.prof('starting synchronizer thread', uid=self._uid)
                self._sync_thread.start()

            # Start WFprocessor
            self._logger.info('Starting WFProcessor process from AppManager')
            self._wfp.start_processor()

            self._report.ok('All components created\n')

            # Create tmgr object only if it does not already exist
            if self._rts == 'radical.pilot':
                from radical.entk.execman.rp import TaskManager
            elif self._rts == 'mock':
                from radical.entk.execman.mock import TaskManager

            if not self._task_manager:
                self._prof.prof('creating tmgr obj', uid=self._uid)
                self._task_manager = TaskManager(sid=self._sid,
                                                 pending_queue=self._pending_queue,
                                                 completed_queue=self._completed_queue,
                                                 mq_hostname=self._mq_hostname,
                                                 rmgr=self._resource_manager,
                                                 port=self._port
                                                 )
                self._logger.info('Starting task manager process from AppManager')
                self._task_manager.start_manager()
                self._task_manager.start_heartbeat()

            active_pipe_count = len(self._workflow)
            finished_pipe_uids = []

            # Wait until all pipelines of the workflow are marked
            # complete
            while ((active_pipe_count > 0) and
                    (self._wfp.workflow_incomplete()) and
                    (self._resource_manager.get_resource_allocation_state() not
                     in self._resource_manager.get_completed_states())):

                if active_pipe_count > 0:

                    for pipe in self._workflow:

                        with pipe.lock:

                            if (pipe.completed) and (pipe.uid not in finished_pipe_uids):

                                self._logger.info('Pipe %s completed' % pipe.uid)
                                finished_pipe_uids.append(pipe.uid)
                                active_pipe_count -= 1
                                self._logger.info('Active pipes: %s' % active_pipe_count)

                if (not self._sync_thread.is_alive()) and (self._cur_attempt <= self._reattempts):

                    self._sync_thread = Thread(target=self._synchronizer,
                                               name='synchronizer-thread')
                    self._logger.info('Restarting synchronizer thread')
                    self._prof.prof('restarting synchronizer', uid=self._uid)
                    self._sync_thread.start()

                    self._cur_attempt += 1

                if (not self._wfp.check_processor()) and (self._cur_attempt <= self._reattempts):

                    """
                    If WFP dies, both child threads are also cleaned out.
                    We simply recreate the wfp object with a copy of the workflow
                    in the appmanager and start the processor.
                    """

                    self._prof.prof('recreating wfp obj', uid=self._uid)
                    self._wfp = WFprocessor(
                        sid=self._sid,
                        workflow=self._workflow,
                        pending_queue=self._pending_queue,
                        completed_queue=self._completed_queue,
                        mq_hostname=self._mq_hostname,
                        port=self._port,
                        resubmit_failed=self._resubmit_failed)

                    self._logger.info('Restarting WFProcessor process from AppManager')
                    self._wfp.start_processor()

                    self._cur_attempt += 1

                if (not self._task_manager.check_heartbeat()) and (self._cur_attempt <= self._reattempts):

                    """
                    If the tmgr process or heartbeat dies, we simply start a
                    new process using the start_manager method. We do not
                    need to create a new instance of the TaskManager object
                    itself. We stop and start a new instance of the
                    heartbeat thread as well.
                    """
                    self._prof.prof('restarting tmgr process and heartbeat', uid=self._uid)

                    self._logger.info('Terminating heartbeat thread')
                    self._task_manager.terminate_heartbeat()
                    self._logger.info('Terminating tmgr process')
                    self._task_manager.terminate_manager()
                    self._logger.info('Restarting task manager process')
                    self._task_manager.start_manager()
                    self._logger.info('Restarting heartbeat thread')
                    self._task_manager.start_heartbeat()

                    self._cur_attempt += 1

            self._prof.prof('start termination', uid=self._uid)

            # Terminate threads in the following order: wfp, helper, synchronizer
            self._logger.info('Terminating WFprocessor')
            self._wfp.terminate_processor()

            self._logger.info('Terminating synchronizer thread')
            self._terminate_sync.set()
            self._sync_thread.join()
            self._logger.info('Synchronizer thread terminated')

            if self._autoterminate:
                self._terminate()

            if self._write_workflow:
                write_workflow(self._workflow, self._sid)

            self._prof.prof('termination done', uid=self._uid)

        except KeyboardInterrupt:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.exception('Execution interrupted by user (you probably hit Ctrl+C), ' +
                               'trying to cancel enqueuer thread gracefully...')

            self._terminate()

            self._prof.prof('termination done', uid=self._uid)

            raise KeyboardInterrupt

        except Exception as ex:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.exception('Error in AppManager: %s' % ex)

            # Terminate threads in the following order: wfp, helper, synchronizer

            self._terminate()

            self._prof.prof('termination done', uid=self._uid)
            raise
Example #9
class AppManager(object):

    """
    An application manager takes responsibility for setting up the communication infrastructure and
    instantiates the ResourceManager, TaskManager and WFProcessor objects with all their threads and
    processes. It is the master object running in the main process and is designed to recover from
    errors in all other objects, threads and processes.

    :Arguments:
        :config_path: path to the config directory to be read for AppManager
        :hostname: host on which the RabbitMQ server is running
        :port: port at which RabbitMQ can be accessed
        :reattempts: number of attempts to re-invoke any failed EnTK components
        :resubmit_failed: resubmit failed tasks (True/False)
        :autoterminate: terminate resource reservation upon execution of all tasks of first workflow (True/False)
        :write_workflow: write workflow and mapping to rts entities to a file (post-termination)
        :rts: Specify RTS to use. Current options: 'mock', 'radical.pilot' (default if unspecified)
        :rmq_cleanup: Cleanup all queues created in RabbitMQ server for current execution (default is True)
        :rts_config: Configuration for the RTS, accepts {"sandbox_cleanup": True/False,"db_cleanup": True/False} when RTS is RP
        :name: Name of the Application. It should be unique between executions. (default is randomly assigned)
    """

    def __init__(self,
                 config_path=None,
                 hostname=None,
                 port=None,
                 reattempts=None,
                 resubmit_failed=None,
                 autoterminate=None,
                 write_workflow=None,
                 rts=None,
                 rmq_cleanup=None,
                 rts_config=None,
                 name=None):

        # Create a session for each EnTK script execution
        if name:
            self._name = name
            self._sid = name
        else:
            self._name = str()
            self._sid = ru.generate_id('re.session', ru.ID_PRIVATE)

        self._read_config(config_path, hostname, port, reattempts,
                          resubmit_failed, autoterminate, write_workflow,
                          rts, rmq_cleanup, rts_config)

        # Create an uid + logger + profiles for AppManager, under the sid
        # namespace
        path = os.getcwd() + '/' + self._sid
        self._uid = ru.generate_id('appmanager.%(item_counter)04d', ru.ID_CUSTOM, namespace=self._sid)
        self._logger = ru.Logger('radical.entk.%s' % self._uid, path=path)
        self._prof = ru.Profiler(name='radical.entk.%s' % self._uid, path=path)
        self._report = ru.Reporter(name='radical.entk.%s' % self._uid)

        self._report.info('EnTK session: %s\n' % self._sid)
        self._prof.prof('create amgr obj', uid=self._uid)
        self._report.info('Creating AppManager')

        self._resource_manager = None
        # RabbitMQ Queues
        self._pending_queue = list()
        self._completed_queue = list()

        # Global parameters to have default values
        self._mqs_setup = False
        self._resource_desc = None
        self._task_manager = None
        self._workflow = None
        self._cur_attempt = 1
        self._shared_data = list()
        self._wfp = None
        self._sync_thread = None

        self._rmq_ping_interval = os.getenv('RMQ_PING_INTERVAL', 10)

        self._logger.info('Application Manager initialized')
        self._prof.prof('amgr obj created', uid=self._uid)
        self._report.ok('>>ok\n')

    def _read_config(self, config_path, hostname, port, reattempts,
                     resubmit_failed, autoterminate, write_workflow,
                     rts, rmq_cleanup, rts_config):

        if not config_path:
            config_path = os.path.dirname(os.path.abspath(__file__))

        config = ru.read_json(os.path.join(config_path, 'config.json'))

        self._mq_hostname = hostname if hostname else str(config['hostname'])
        self._port = int(port if port else config['port'])
        self._reattempts = reattempts if reattempts else config['reattempts']
        self._resubmit_failed = resubmit_failed if resubmit_failed is not None else config['resubmit_failed']
        self._autoterminate = autoterminate if autoterminate is not None else config['autoterminate']
        self._write_workflow = write_workflow if write_workflow is not None else config['write_workflow']
        self._rts = rts if rts in ['radical.pilot', 'mock'] else str(config['rts'])
        self._rmq_cleanup = rmq_cleanup if rmq_cleanup is not None else config['rmq_cleanup']
        self._rts_config = rts_config if rts_config is not None else config['rts_config']

        self._num_pending_qs = config['pending_qs']
        self._num_completed_qs = config['completed_qs']

    # ------------------------------------------------------------------------------------------------------------------
    # Getter functions
    # ------------------------------------------------------------------------------------------------------------------

    @property
    def name(self):
        """
        Name for the application manager. Allows the user to set up the name of
        the application manager, as well as its session ID. This name should be
        unique between different EnTK executions, otherwise it will produce an
        error.

        :getter: Returns the name of the application manager
        :setter: Assigns the name of the application manager
        :type: String
        """

        return self._name

    @property
    def sid(self):
        """
        Get the session ID of the current EnTK execution

        :getter: Returns the session ID of the EnTK execution
        :type: String
        """

        return self._sid

    @property
    def resource_desc(self):
        """
        :getter: Returns the resource description
        :setter: Assigns a resource description
        """

        return self._resource_desc

    @property
    def workflow(self):
        """
        :getter: Return the workflow assigned for execution
        :setter: Assign workflow to be executed
        """

        return self._workflow

    @property
    def shared_data(self):
        """
        :getter: Return list of filenames that are shared between multiple tasks of the application
        :setter: Assign a list of names of files that need to be staged to the remote machine
        """

        return self._shared_data

    # ------------------------------------------------------------------------------------------------------------------
    # Setter functions
    # ------------------------------------------------------------------------------------------------------------------

    @name.setter
    def name(self, value):

        if not isinstance(value, str):
            raise TypeError(expected_type=str, actual_type=type(value))

        else:
            self._name = value

    @resource_desc.setter
    def resource_desc(self, value):

        if self._rts == 'radical.pilot':
            from radical.entk.execman.rp import ResourceManager
            self._resource_manager = ResourceManager(resource_desc=value,
                                                     sid=self._sid,
                                                     rts_config=self._rts_config)
        elif self._rts == 'mock':
            from radical.entk.execman.mock import ResourceManager
            self._resource_manager = ResourceManager(resource_desc=value,
                                                     sid=self._sid)

        self._report.info('Validating and assigning resource manager')

        if self._resource_manager._validate_resource_desc():
            self._resource_manager._populate()
            self._resource_manager.shared_data = self._shared_data
        else:
            self._logger.error('Could not validate resource description')
            raise EnTKError('Could not validate resource description')
        self._report.ok('>>ok\n')

    @workflow.setter
    def workflow(self, workflow):

        self._prof.prof('assigning workflow', uid=self._uid)

        for p in workflow:
            if not isinstance(p, Pipeline):
                self._logger.info('workflow type incorrect')
                raise TypeError(expected_type=['Pipeline', 'set of Pipelines'], actual_type=type(p))

            p._validate()

        self._workflow = workflow
        self._logger.info('Workflow assigned to Application Manager')

    @shared_data.setter
    def shared_data(self, data):

        if not isinstance(data, list):
            data = [data]

        for value in data:
            if not isinstance(value, str):
                raise TypeError(expected_type=str, actual_type=type(value))

        if self._resource_manager:
            self._resource_manager.shared_data = data


    # ------------------------------------------------------------------------------------------------------------------
    # Public methods
    # ------------------------------------------------------------------------------------------------------------------

    def run(self):
        """
        **Purpose**: Run the application manager. Once the workflow and resource manager have been
        assigned, invoking this method sets up the communication infrastructure, submits a resource
        request and then submits all the tasks.
        """

        try:

            # Reset objects local to each run
            self._wfp = None
            self._sync_thread = None
            self._terminate_sync = Event()
            self._resubmit_failed = False
            self._cur_attempt = 1

            if not self._workflow:
                self._logger.error('No workflow assigned currently, please check your script')
                raise MissingError(obj=self._uid, missing_attribute='workflow')

            if not self._resource_manager:
                self._logger.error('No resource manager assigned currently, please create and add a valid resource manager')
                raise MissingError(obj=self._uid, missing_attribute='resource_manager')

            self._prof.prof('amgr run started', uid=self._uid)

            # Set up RabbitMQ
            if not self._mqs_setup:

                self._report.info('Setting up RabbitMQ system')
                setup = self._setup_mqs()

                if not setup:
                    self._logger.error('RabbitMQ system not available')
                    raise EnTKError("RabbitMQ setup failed")

                self._mqs_setup = True

                self._report.ok('>>ok\n')

            # Create WFProcessor object
            self._prof.prof('creating wfp obj', uid=self._uid)
            self._wfp = WFprocessor(sid=self._sid,
                                    workflow=self._workflow,
                                    pending_queue=self._pending_queue,
                                    completed_queue=self._completed_queue,
                                    mq_hostname=self._mq_hostname,
                                    port=self._port,
                                    resubmit_failed=self._resubmit_failed)
            self._wfp._initialize_workflow()
            self._workflow = self._wfp.workflow


            # Submit a resource request if no allocation has been made yet, or
            # resubmit if the previous one has completed
            if self._resource_manager:
                res_alloc_state = self._resource_manager.get_resource_allocation_state()
                if (not res_alloc_state) or (res_alloc_state in self._resource_manager.get_completed_states()):

                    self._logger.info('Starting resource request submission')
                    self._prof.prof('init rreq submission', uid=self._uid)
                    self._resource_manager._submit_resource_request()
                    res_alloc_state = self._resource_manager.get_resource_allocation_state()
                    if res_alloc_state in self._resource_manager.get_completed_states():
                        raise EnTKError(msg="Cannot proceed. Resource allocation ended up in %s"%res_alloc_state)

            else:

                self._logger.exception('Cannot run without resource manager, please create and assign a resource manager')
                raise EnTKError(msg='Missing resource manager')

            # Start synchronizer thread
            if not self._sync_thread:
                self._logger.info('Starting synchronizer thread')
                self._sync_thread = Thread(target=self._synchronizer, name='synchronizer-thread')
                self._prof.prof('starting synchronizer thread', uid=self._uid)
                self._sync_thread.start()

            # Start WFprocessor
            self._logger.info('Starting WFProcessor process from AppManager')
            self._wfp.start_processor()

            self._report.ok('All components created\n')

            # Create tmgr object only if it does not already exist
            if self._rts == 'radical.pilot':
                from radical.entk.execman.rp import TaskManager
            elif self._rts == 'mock':
                from radical.entk.execman.mock import TaskManager

            if not self._task_manager:
                self._prof.prof('creating tmgr obj', uid=self._uid)
                self._task_manager = TaskManager(sid=self._sid,
                                                 pending_queue=self._pending_queue,
                                                 completed_queue=self._completed_queue,
                                                 mq_hostname=self._mq_hostname,
                                                 rmgr=self._resource_manager,
                                                 port=self._port
                                                 )
                self._logger.info('Starting task manager process from AppManager')
                self._task_manager.start_manager()
                self._task_manager.start_heartbeat()

            active_pipe_count = len(self._workflow)
            finished_pipe_uids = []

            # Wait until all pipelines of the workflow are marked
            # complete
            while ((active_pipe_count > 0) and
                    (self._wfp.workflow_incomplete()) and
                    (self._resource_manager.get_resource_allocation_state() not
                     in self._resource_manager.get_completed_states())):

                if active_pipe_count > 0:

                    for pipe in self._workflow:

                        with pipe.lock:

                            if (pipe.completed) and (pipe.uid not in finished_pipe_uids):

                                self._logger.info('Pipe %s completed' % pipe.uid)
                                finished_pipe_uids.append(pipe.uid)
                                active_pipe_count -= 1
                                self._logger.info('Active pipes: %s' % active_pipe_count)

                if (not self._sync_thread.is_alive()) and (self._cur_attempt <= self._reattempts):

                    self._sync_thread = Thread(target=self._synchronizer,
                                               name='synchronizer-thread')
                    self._logger.info('Restarting synchronizer thread')
                    self._prof.prof('restarting synchronizer', uid=self._uid)
                    self._sync_thread.start()

                    self._cur_attempt += 1

                if (not self._wfp.check_processor()) and (self._cur_attempt <= self._reattempts):

                    """
                    If WFP dies, both child threads are also cleaned out.
                    We simply recreate the wfp object with a copy of the workflow
                    in the appmanager and start the processor.
                    """

                    self._prof.prof('recreating wfp obj', uid=self._uid)
                    self._wfp = WFprocessor(
                        sid=self._sid,
                        workflow=self._workflow,
                        pending_queue=self._pending_queue,
                        completed_queue=self._completed_queue,
                        mq_hostname=self._mq_hostname,
                        port=self._port,
                        resubmit_failed=self._resubmit_failed)

                    self._logger.info('Restarting WFProcessor process from AppManager')
                    self._wfp.start_processor()

                    self._cur_attempt += 1

                if (not self._task_manager.check_heartbeat()) and (self._cur_attempt <= self._reattempts):

                    """
                    If the tmgr process or heartbeat dies, we simply start a
                    new process using the start_manager method. We do not
                    need to create a new instance of the TaskManager object
                    itself. We stop and start a new instance of the
                    heartbeat thread as well.
                    """
                    self._prof.prof('restarting tmgr process and heartbeat', uid=self._uid)

                    self._logger.info('Terminating heartbeat thread')
                    self._task_manager.terminate_heartbeat()
                    self._logger.info('Terminating tmgr process')
                    self._task_manager.terminate_manager()
                    self._logger.info('Restarting task manager process')
                    self._task_manager.start_manager()
                    self._logger.info('Restarting heartbeat thread')
                    self._task_manager.start_heartbeat()

                    self._cur_attempt += 1

            self._prof.prof('start termination', uid=self._uid)

            # Terminate threads in the following order: wfp, helper, synchronizer
            self._logger.info('Terminating WFprocessor')
            self._wfp.terminate_processor()

            self._logger.info('Terminating synchronizer thread')
            self._terminate_sync.set()
            self._sync_thread.join()
            self._logger.info('Synchronizer thread terminated')

            if self._autoterminate:
                self._terminate()

            if self._write_workflow:
                write_workflow(self._workflow, self._sid)

            self._prof.prof('termination done', uid=self._uid)

        except KeyboardInterrupt:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.exception('Execution interrupted by user (you probably hit Ctrl+C), ' +
                               'trying to cancel enqueuer thread gracefully...')

            self._terminate()

            self._prof.prof('termination done', uid=self._uid)

            raise KeyboardInterrupt

        except Exception as ex:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.exception('Error in AppManager: %s' % ex)

            # Terminate threads in the following order: wfp, helper, synchronizer

            self._terminate()

            self._prof.prof('termination done', uid=self._uid)
            raise
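For context, a class like the one above is normally driven by a short user script. Here is a minimal sketch, assuming a RabbitMQ server on localhost:5672 and the Pipeline/Stage/Task API used elsewhere on this page; resource values are placeholders.

from radical.entk import AppManager, Pipeline, Stage, Task

t = Task()
t.executable = '/bin/date'

s = Stage()
s.add_tasks(t)

p = Pipeline()
p.add_stages(s)

# hostname/port are illustrative; match them to your RabbitMQ server
amgr = AppManager(hostname='localhost', port=5672)
amgr.resource_desc = {'resource': 'local.localhost',
                      'walltime': 10,
                      'cpus': 1}
amgr.workflow = [p]
amgr.run()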
Example #10
    def _start_all_comps(self):

        if self._wfp:
            # This condition is called when there are multiple workflows
            # submitted for execution. Amgr.run() was probably called twice.
            # If a WFP exists, we use the same one but with the new workflow.
            # Since WFP (and its threads) and the Amgr share memory, we have
            # to terminate WFP's threads, assign the new workflow and then
            # start the threads again.
            self._wfp.terminate_processor()
            self._wfp._workflow = self._workflow
            self._wfp.start_processor()
            return

        # Create WFprocessor and initialize the workflow contents with uids
        self._prof.prof('wfp_create_start', uid=self._uid)
        self._wfp = WFprocessor(sid=self._sid,
                                workflow=self._workflow,
                                pending_queue=self._pending_queue,
                                completed_queue=self._completed_queue,
                                resubmit_failed=self._resubmit_failed,
                                rmq_conn_params=self._rmq_conn_params)
        self._prof.prof('wfp_create_stop', uid=self._uid)

        # Start synchronizer thread
        if not self._sync_thread:
            self._logger.info('Starting synchronizer thread')
            self._sync_thread = mt.Thread(target=self._synchronizer,
                                          name='synchronizer-thread')
            self._prof.prof('sync_thread_create', uid=self._uid)
            self._sync_thread.start()

        # Start WFprocessor
        self._logger.info('Starting WFProcessor')
        self._wfp.start_processor()
        self._report.ok('All components created\n')

        # Create tmgr object only if it does not already exist
        if self._rts == 'radical.pilot':
            from radical.entk.execman.rp import TaskManager

        elif self._rts == 'mock':
            from radical.entk.execman.mock import TaskManager

        if not self._task_manager:

            self._logger.info('Starting task manager')
            self._prof.prof('tmgr_create_start', uid=self._uid)

            self._task_manager = TaskManager(
                sid=self._sid,
                pending_queue=self._pending_queue,
                completed_queue=self._completed_queue,
                rmgr=self._rmgr,
                rmq_conn_params=self._rmq_conn_params)

            self._task_manager.start_manager()
            self._task_manager.start_heartbeat()

            self._prof.prof('tmgr_create_stop', uid=self._uid)
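The WFP-reuse branch at the top of _start_all_comps only matters when run() is invoked more than once on the same AppManager. A hedged usage sketch, reusing the hypothetical generate_pipeline helper sketched earlier:

# First run(): creates the WFprocessor, task manager and synchronizer.
amgr.workflow = [generate_pipeline(1)]
amgr.run()

# Second run(): the existing WFprocessor is stopped, handed the new
# workflow and restarted instead of being recreated from scratch.
amgr.workflow = [generate_pipeline(2)]
amgr.run()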
Example #11
class AppManager(object):
    '''
    An application manager takes responsibility for setting up the
    communication infrastructure and instantiates the ResourceManager,
    TaskManager and WFProcessor objects with all their threads and processes.
    It is the master object running in the main process and is designed to
    recover from errors in all other objects, threads and processes.

    :Arguments:
        :config_path:     path to the config directory to be read for AppManager
        :hostname:        host on which the RabbitMQ server is running
        :port:            port at which RabbitMQ can be accessed
        :username:        username to log in to RabbitMQ
        :password:        password to log in to RabbitMQ
        :reattempts:      number of attempts to re-invoke any failed EnTK
                          components
        :resubmit_failed: resubmit failed tasks (True/False)
        :autoterminate:   terminate resource reservation upon execution of all
                          tasks of first workflow (True/False)
        :write_workflow:  write workflow and mapping to rts entities to a file
                          (post-termination)
        :rts:             Specify RTS to use. Current options: 'mock',
                          'radical.pilot' (default if unspecified)
        :rmq_cleanup:     Cleanup all queues created in RabbitMQ server for
                          current execution (default is True)
        :rts_config:      Configuration for the RTS, accepts
                          {'sandbox_cleanup': True/False,'db_cleanup':
                          True/False} when RTS is RP
        :name:            Name of the Application. It should be unique between
                          executions. (default is randomly assigned)
    '''

    # --------------------------------------------------------------------------
    #
    def __init__(self,
                 config_path=None,
                 hostname=None,
                 port=None,
                 username=None,
                 password=None,
                 reattempts=None,
                 resubmit_failed=None,
                 autoterminate=None,
                 write_workflow=None,
                 rts=None,
                 rmq_cleanup=None,
                 rts_config=None,
                 name=None):

        # Create a session for each EnTK script execution
        if name:
            self._name = name
            self._sid = name
        else:
            self._name = str()
            self._sid = ru.generate_id('re.session', ru.ID_PRIVATE)

        self._read_config(config_path, hostname, port, username, password,
                          reattempts, resubmit_failed, autoterminate,
                          write_workflow, rts, rmq_cleanup, rts_config)

        # Create an uid + logger + profiles for AppManager, under the sid
        # namespace

        self._uid = ru.generate_id('appmanager.%(counter)04d', ru.ID_CUSTOM)

        path = os.getcwd() + '/' + self._sid
        name = 'radical.entk.%s' % self._uid

        self._logger = ru.Logger(name=name, path=path)
        self._prof = ru.Profiler(name=name, path=path)
        self._report = ru.Reporter(name=name)

        self._report.info('EnTK session: %s\n' % self._sid)
        self._report.info('Creating AppManager')
        self._prof.prof('amgr_creat', uid=self._uid)

        self._rmgr = None
        self._pending_queue = list()
        self._completed_queue = list()

        # Global parameters to have default values
        self._mqs_setup = False
        self._resource_desc = None
        self._task_manager = None
        self._workflow = None
        self._workflows = list()
        self._cur_attempt = 1
        self._shared_data = list()
        self._outputs = None
        self._wfp = None
        self._sync_thread = None
        self._terminate_sync = mt.Event()
        self._resubmit_failed = False
        self._port = int(self._port)

        # Setup rabbitmq queues
        self._setup_mqs()

        self._rmq_ping_interval = int(os.getenv('RMQ_PING_INTERVAL', "10"))

        self._logger.info('Application Manager initialized')
        self._prof.prof('amgr_created', uid=self._uid)
        self._report.ok('>>ok\n')

    # --------------------------------------------------------------------------
    #
    def _read_config(self, config_path, hostname, port, username, password,
                     reattempts, resubmit_failed, autoterminate,
                     write_workflow, rts, rmq_cleanup, rts_config):

        if not config_path:
            config_path = os.path.dirname(os.path.abspath(__file__))

        config = ru.read_json(os.path.join(config_path, 'config.json'))

        def _if(val1, val2):
            if val1 is not None:
                return val1
            return val2

        self._hostname = _if(hostname, config['hostname'])
        self._port = _if(port, config['port'])
        self._username = _if(username, config['username'])
        self._password = _if(password, config['password'])
        self._reattempts = _if(reattempts, config['reattempts'])
        self._resubmit_failed = _if(resubmit_failed, config['resubmit_failed'])
        self._autoterminate = _if(autoterminate, config['autoterminate'])
        self._write_workflow = _if(write_workflow, config['write_workflow'])
        self._rmq_cleanup = _if(rmq_cleanup, config['rmq_cleanup'])
        self._rts_config = _if(rts_config, config['rts_config'])
        self._rts = _if(rts, config['rts'])

        credentials = pika.PlainCredentials(self._username, self._password)
        self._rmq_conn_params = pika.connection.ConnectionParameters(
            host=self._hostname, port=self._port, credentials=credentials)

        self._num_pending_qs = config['pending_qs']
        self._num_completed_qs = config['completed_qs']

        if self._rts not in ['radical.pilot', 'mock']:
            raise ValueError('invalid RTS %s' % self._rts)

    # --------------------------------------------------------------------------
    #
    # Getter functions
    #
    @property
    def name(self):
        '''
        Name for the application manager. Allows the user to set up the name of
        the application manager, as well as its session ID. This name should be
        unique between different EnTK executions, otherwise it will produce an
        error.

        :getter: Returns the name of the application manager
        :setter: Assigns the name of the application manager
        :type: String
        '''

        return self._name

    # --------------------------------------------------------------------------
    #
    @property
    def sid(self):
        '''
        Get the session ID of the current EnTK execution

        :getter: Returns the session ID of the EnTK execution
        :type: String
        '''

        return self._sid

    # --------------------------------------------------------------------------
    #
    @property
    def resource_desc(self):
        '''
        :getter: Returns the resource description
        :setter: Assigns a resource description
        '''

        return self._resource_desc

    # --------------------------------------------------------------------------
    #
    @property
    def workflow(self):
        '''
        :getter: Return the last workflow assigned for execution
        :setter: Assign a new workflow to be executed
        '''

        return self._workflow

    # --------------------------------------------------------------------------
    #
    @property
    def workflows(self):
        """
        :getter: Return a list of workflows assigned for execution
        """

        return self._workflows

    @property
    def shared_data(self):
        '''
        :getter: Return list of filenames that are shared between multiple tasks
                 of the application
        :setter: Assign a list of names of files that need to be staged to the
                remote machine
        '''

        return self._shared_data

    @property
    def outputs(self):
        '''
        :getter: Return list of filenames that are to be staged out after
                 execution
        :setter: Assign a list of names of files that need to be staged from the
                 remote machine
        '''

        return self._outputs

    # --------------------------------------------------------------------------
    # Setter functions
    #
    @name.setter
    def name(self, value):

        if not isinstance(value, str):
            raise ree.TypeError(expected_type=str, actual_type=type(value))

        self._name = value

    # --------------------------------------------------------------------------
    #
    @resource_desc.setter
    def resource_desc(self, value):

        if self._rts == 'radical.pilot':
            from radical.entk.execman.rp import ResourceManager

        elif self._rts == 'mock':
            from radical.entk.execman.mock import ResourceManager

        self._rmgr = ResourceManager(resource_desc=value,
                                     sid=self._sid,
                                     rts_config=self._rts_config)

        self._report.info('Validating and assigning resource manager')

        if not self._rmgr._validate_resource_desc():
            self._logger.error('Could not validate resource description')
            raise ree.EnTKError('Could not validate resource description')

        self._rmgr._populate()
        self._rmgr.shared_data = self._shared_data
        self._rmgr.outputs = self._outputs

        self._report.ok('>>ok\n')

    # --------------------------------------------------------------------------
    #
    @workflow.setter
    def workflow(self, workflow):

        self._prof.prof('assigning workflow', uid=self._uid)

        for p in workflow:

            if not isinstance(p, Pipeline):
                self._logger.info('workflow type incorrect')
                raise ree.TypeError(
                    expected_type=['Pipeline', 'set of Pipelines'],
                    actual_type=type(p))
            p._validate()

        # keep history
        self._workflows.append(workflow)

        # set current workflow
        self._workflow = workflow
        self._logger.info('Workflow assigned to Application Manager')

    # --------------------------------------------------------------------------
    #
    @shared_data.setter
    def shared_data(self, data):

        if not isinstance(data, list):
            data = [data]

        for value in data:
            if not isinstance(value, str):
                raise ree.TypeError(expected_type=str, actual_type=type(value))

        self._shared_data = data

        if self._rmgr:
            self._rmgr.shared_data = data

    # --------------------------------------------------------------------------
    #
    @outputs.setter
    def outputs(self, data):

        if not isinstance(data, list):
            data = [data]

        for value in data:
            if not isinstance(value, str):
                raise ree.TypeError(expected_type=str, actual_type=type(value))

        if self._rmgr:
            self._rmgr.outputs = data

    # --------------------------------------------------------------------------
    #
    # Public methods
    #
    def run(self):
        '''
        **Purpose**: Run the application manager. Once the workflow and resource
        manager have been assigned, invoking this method sets up the
        communication infrastructure, submits a resource request and then
        submits all the tasks.
        '''

        ret = None

        try:
            self._prof.prof('amgr_start', uid=self._uid)

            # Set None for variables local to each run
            self._resubmit_failed = False
            self._cur_attempt = 1

            # Ensure that a workflow and a resource description have
            # been defined
            if not self._workflow:
                self._logger.error('No workflow assigned currently, '
                                   'please check your script')
                raise ree.MissingError(obj=self._uid,
                                       missing_attribute='workflow')

            if not self._rmgr:
                self._logger.error('No resource manager assigned currently, '
                                   'please create and add a valid resource '
                                   'manager')
                raise ree.MissingError(obj=self._uid,
                                       missing_attribute='resource_manager')
            self._prof.prof('amgr run started', uid=self._uid)

            # ensure rabbitmq setup
            self._setup_mqs()

            # Submit a resource request if no allocation has been made yet, or
            # resubmit if the previous one has completed
            res_alloc_state = self._rmgr.get_resource_allocation_state()
            if not res_alloc_state or \
                   res_alloc_state in self._rmgr.get_completed_states():

                self._logger.info('Starting resource request submission')
                self._prof.prof('rreq_init', uid=self._uid)

                self._rmgr._submit_resource_request()

                res_alloc_state = self._rmgr.get_resource_allocation_state()
                if res_alloc_state in self._rmgr.get_completed_states():
                    raise ree.EnTKError(msg='Cannot proceed. Resource allocation '
                                        'ended in state %s' % res_alloc_state)

            # Start all components and subcomponents
            self._start_all_comps()

            # Run workflow -- this call is blocking till all tasks of the
            # workflow are executed or an error/exception is encountered
            self._run_workflow()
            self._logger.info('Workflow execution finished.')
            if self._autoterminate:
                self._logger.debug('Autoterminate set to %s.' %
                                   self._autoterminate)
                self.terminate()

        except KeyboardInterrupt:

            self._logger.exception('Execution interrupted by user (you '
                                   'probably hit Ctrl+C), trying to cancel '
                                   'enqueuer thread gracefully...')
            self.terminate()
            raise KeyboardInterrupt

        except Exception:

            self._logger.exception('Error in AppManager')
            self.terminate()
            raise

        # return list of fetched output data, or None.
        outputs = self.outputs
        if outputs:
            ret = outputs
        return ret

    # --------------------------------------------------------------------------
    #
    def terminate(self):

        self._prof.prof('term_start', uid=self._uid)

        # Terminate threads in following order: wfp, helper, synchronizer
        if self._wfp:
            self._logger.info('Terminating WFprocessor')
            self._wfp.terminate_processor()

        if self._task_manager:
            self._logger.info('Terminating task manager process')
            self._task_manager.terminate_manager()
            self._task_manager.terminate_heartbeat()

        if self._sync_thread:
            self._logger.info('Terminating synchronizer thread')
            self._terminate_sync.set()
            self._sync_thread.join()
            self._logger.info('Synchronizer thread terminated')

        if self._write_workflow:
            write_workflows(self.workflows, self._sid)

        if self._rmgr:
            self._rmgr._terminate_resource_request()

        if os.environ.get('RADICAL_ENTK_PROFILE'):
            write_session_description(self)

        if self._rmq_cleanup:
            self._cleanup_mqs()

        self._report.info('All components terminated\n')
        self._prof.prof('termination done', uid=self._uid)

    # --------------------------------------------------------------------------
    #
    def resource_terminate(self):

        self._logger.warning('DeprecationWarning: `resource_terminate()` is '
                             'deprecated, please use `terminate()`')
        self.terminate()

    # --------------------------------------------------------------------------
    #
    # Private methods
    #
    def _setup_mqs(self):
        '''
        **Purpose**: Set up the RabbitMQ system on the client side. We instantiate
        queue(s) 'pendingq-*' for communication between the enqueuer thread and
        the task manager process. We instantiate queue(s) 'completedq-*' for
        communication between the task manager and dequeuer thread. We
        instantiate queue 'sync-to-master' for communication from
        enqueuer/dequeuer/task_manager to the synchronizer thread. We
        instantiate queue 'sync-ack' for communication from synchronizer thread
        to enqueuer/dequeuer/task_manager.

        Details: All queues are durable: Even if the RabbitMQ server goes down,
        the queues are saved to disk and can be retrieved. This also means that
        after an erroneous run the queues might still have unacknowledged
        messages and will contain messages from that run. Hence, in every new
        run, we first delete the queue and create a new one.
        '''
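        # The delete-then-declare behaviour described above, in isolation --
        # a sketch assuming a local broker, with an illustrative queue name:
        #
        #     conn = pika.BlockingConnection(
        #                pika.ConnectionParameters('localhost'))
        #     chan = conn.channel()
        #     chan.queue_delete(queue='sid-pendingq-1')   # drop stale messages
        #     chan.queue_declare(queue='sid-pendingq-1')  # recreate afresh
        #     conn.close()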

        try:
            self._report.info('Setting up RabbitMQ system')
            if self._mqs_setup:
                self._report.ok('>>n/a\n')
                return

            self._report.ok('>>ok\n')

            self._prof.prof('mqs_setup_start', uid=self._uid)
            self._logger.debug('Setting up mq connection and channel')

            mq_connection = pika.BlockingConnection(self._rmq_conn_params)

            mq_channel = mq_connection.channel()

            self._logger.debug('Connection and channel setup successful')
            self._logger.debug('Setting up all exchanges and queues')

            qs = [
                '%s-tmgr-to-sync' % self._sid,
                '%s-cb-to-sync' % self._sid,
                '%s-sync-to-tmgr' % self._sid,
                '%s-sync-to-cb' % self._sid
            ]

            for i in range(1, self._num_pending_qs + 1):
                queue_name = '%s-pendingq-%s' % (self._sid, i)
                self._pending_queue.append(queue_name)
                qs.append(queue_name)

            for i in range(1, self._num_completed_qs + 1):
                queue_name = '%s-completedq-%s' % (self._sid, i)
                self._completed_queue.append(queue_name)
                qs.append(queue_name)
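            # e.g. with sid 're.session.xyz' and one queue of each kind, the
            # last two entries of qs are 're.session.xyz-pendingq-1' and
            # 're.session.xyz-completedq-1'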

            for q in qs:
                # note: surviving a broker crash would additionally require
                # `durable=True` in this declaration
                mq_channel.queue_declare(queue=q)

            self._mqs_setup = True

            self._logger.debug('All exchanges and queues are setup')
            self._prof.prof('mqs_setup_stop', uid=self._uid)

        except Exception as ex:

            self._logger.exception('Error setting up RabbitMQ system: %s' % ex)
            raise

    # --------------------------------------------------------------------------
    #
    def _cleanup_mqs(self):

        try:
            self._prof.prof('mqs_cleanup_start', uid=self._uid)

            mq_connection = pika.BlockingConnection(self._rmq_conn_params)
            mq_channel = mq_connection.channel()

            mq_channel.queue_delete(queue='%s-tmgr-to-sync' % self._sid)
            mq_channel.queue_delete(queue='%s-cb-to-sync' % self._sid)
            mq_channel.queue_delete(queue='%s-sync-to-tmgr' % self._sid)
            mq_channel.queue_delete(queue='%s-sync-to-cb' % self._sid)

            for i in range(1, self._num_pending_qs + 1):
                queue_name = '%s-pendingq-%s' % (self._sid, i)
                mq_channel.queue_delete(queue=queue_name)

            for i in range(1, self._num_completed_qs + 1):
                queue_name = '%s-completedq-%s' % (self._sid, i)
                mq_channel.queue_delete(queue=queue_name)

            self._prof.prof('mqs_cleanup_stop', uid=self._uid)

            self._mqs_setup = False

        except Exception:
            self._logger.exception('Message queues not deleted, error')
            raise

    # --------------------------------------------------------------------------
    #
    def _start_all_comps(self):

        if self._wfp:
            # This branch is taken when multiple workflows are submitted for
            # execution, i.e., Amgr.run() was called more than once. If a WFP
            # exists, we reuse it with the new workflow. Since the WFP (and
            # its threads) and the Amgr share memory, we have to terminate
            # the WFP's threads, assign the new workflow, and then start the
            # threads again.
            self._wfp.terminate_processor()
            self._wfp._workflow = self._workflow
            self._wfp.start_processor()
            return

        # Create the WFprocessor and initialize the workflow contents with
        # uids
        self._prof.prof('wfp_create_start', uid=self._uid)
        self._wfp = WFprocessor(sid=self._sid,
                                workflow=self._workflow,
                                pending_queue=self._pending_queue,
                                completed_queue=self._completed_queue,
                                resubmit_failed=self._resubmit_failed,
                                rmq_conn_params=self._rmq_conn_params)
        self._prof.prof('wfp_create_stop', uid=self._uid)

        # Start synchronizer thread
        if not self._sync_thread:
            self._logger.info('Starting synchronizer thread')
            self._sync_thread = mt.Thread(target=self._synchronizer,
                                          name='synchronizer-thread')
            self._prof.prof('sync_thread_create', uid=self._uid)
            self._sync_thread.start()

        # Start WFprocessor
        self._logger.info('Starting WFProcessor')
        self._wfp.start_processor()
        self._report.ok('All components created\n')

        # Select the TaskManager implementation for the configured RTS
        if self._rts == 'radical.pilot':
            from radical.entk.execman.rp import TaskManager

        elif self._rts == 'mock':
            from radical.entk.execman.mock import TaskManager

        # Create the tmgr object only if it does not already exist
        if not self._task_manager:

            self._logger.info('Starting task manager')
            self._prof.prof('tmgr_create_start', uid=self._uid)

            self._task_manager = TaskManager(
                sid=self._sid,
                pending_queue=self._pending_queue,
                completed_queue=self._completed_queue,
                rmgr=self._rmgr,
                rmq_conn_params=self._rmq_conn_params)

            self._task_manager.start_manager()
            self._task_manager.start_heartbeat()

            self._prof.prof('tmgr_create_stop', uid=self._uid)

    # --------------------------------------------------------------------------
    #
    def _run_workflow(self):

        active_pipe_count = len(self._workflow)
        finished_pipe_uids = list()

        # We wait till all pipelines of the workflow are marked
        # complete
        state = self._rmgr.get_resource_allocation_state()
        final = self._rmgr.get_completed_states()
        incomplete = self._wfp.workflow_incomplete()

        while active_pipe_count and \
              incomplete        and \
              state not in final:

            state = self._rmgr.get_resource_allocation_state()

            for pipe in self._workflow:

                with pipe.lock:

                    if pipe.completed and \
                        pipe.uid not in finished_pipe_uids:

                        finished_pipe_uids.append(pipe.uid)
                        active_pipe_count -= 1

                        self._logger.info('Pipe %s completed' % pipe.uid)
                        self._logger.info('Active pipes %s' %
                                          active_pipe_count)


            if not self._sync_thread.is_alive() and \
                self._cur_attempt <= self._reattempts:

                self._sync_thread = mt.Thread(target=self._synchronizer,
                                              name='synchronizer-thread')
                self._sync_thread.start()
                self._cur_attempt += 1

                self._prof.prof('sync_thread_restart', uid=self._uid)
                self._logger.info('Restarting synchronizer thread')


            if not self._wfp.check_processor() and \
                self._cur_attempt <= self._reattempts:

                # If WFP dies, both child threads are also cleaned out.
                # We simply recreate the wfp object with a copy of the
                # workflow in the appmanager and start the processor.

                self._prof.prof('wfp_recreate', uid=self._uid)
                self._wfp = WFprocessor(sid=self._sid,
                                        workflow=self._workflow,
                                        pending_queue=self._pending_queue,
                                        completed_queue=self._completed_queue,
                                        resubmit_failed=self._resubmit_failed,
                                        rmq_conn_params=self._rmq_conn_params)

                self._logger.info('Restarting WFProcessor')
                self._wfp.start_processor()

                self._cur_attempt += 1


            if not self._task_manager.check_heartbeat() and \
                self._cur_attempt <= self._reattempts:

                # If the tmgr process or heartbeat dies, we simply start a
                # new process using the start_manager method. We do not
                # need to create a new instance of the TaskManager object
                # itself. We stop and start a new instance of the
                # heartbeat thread as well.

                self._prof.prof('restart_tmgr', uid=self._uid)

                self._logger.info('Terminating heartbeat thread')
                self._task_manager.terminate_heartbeat()
                self._logger.info('Terminating tmgr process')
                self._task_manager.terminate_manager()

                self._logger.info('Restarting task manager process')
                self._task_manager.start_manager()
                self._logger.info('Restarting heartbeat thread')
                self._task_manager.start_heartbeat()

                self._cur_attempt += 1

    # --------------------------------------------------------------------------
    #
    def _get_message_to_sync(self, mq_channel, qname):
        '''
        Read a message from the given queue and derive the reply queue name
        that `_update_task` uses to acknowledge the message back to its
        publisher
        '''

        # --------------------------------------------------------------
        # Messages between tmgr Main thread and synchronizer -- only
        # Task objects
        method_frame, props, body = mq_channel.basic_get(queue=qname)
        tmp = qname.split("-")
        q_sid = ''.join(tmp[:-3])
        q_from = tmp[-3]
        q_to = tmp[-1]
        return_queue_name = f"{q_sid}-{q_to}-to-{q_from}"
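        # e.g. qname '<sid>-tmgr-to-sync' gives q_from='tmgr', q_to='sync',
        # and hence the reply queue '<sid>-sync-to-tmgr' (assuming the sid
        # itself contains no dashes)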

        # The message received is a JSON object with the following
        # structure:
        # msg = {
        #         'type': 'Pipeline'/'Stage'/'Task',
        #         'object': json/dict
        #         }
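        # e.g. msg = {'type'  : 'Task',
        #             'object': {'uid': 'task.0000', 'state': 'DONE', ...}}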
        if body:

            msg = json.loads(body)
            uid = msg['object']['uid']
            state = msg['object']['state']

            self._prof.prof('sync_recv_obj_state_%s' % state, uid=uid)
            self._logger.debug('recv %s in state %s (sync)' % (uid, state))

            if msg['type'] == 'Task':
                self._update_task(msg, return_queue_name, props.correlation_id,
                                  mq_channel, method_frame)

    # --------------------------------------------------------------------------
    #
    def _update_task(self, msg, reply_to, corr_id, mq_channel, method_frame):
        # pylint: disable=W0612,W0613

        completed_task = Task()
        completed_task.from_dict(msg['object'])

        self._logger.info('Received %s with state %s' %
                          (completed_task.uid, completed_task.state))

        # found_task = False

        # Traverse the entire workflow to find the correct task
        for pipe in self._workflow:

            with pipe.lock:

                if pipe.completed or \
                    pipe.uid != completed_task.parent_pipeline['uid']:
                    continue

                for stage in pipe.stages:

                    if stage.uid != completed_task.parent_stage['uid']:
                        continue

                    for task in stage.tasks:

                        if completed_task.uid != task.uid or \
                            completed_task.state == task.state:
                            continue

                        self._logger.debug(
                            'Found task %s in state %s, changing to %s' %
                            (task.uid, task.state, completed_task.state))
                        if task.state in [states.DONE, states.FAILED]:
                            self._logger.debug(
                                ('No change on task state %s '
                                 'in state %s') % (task.uid, task.state))
                            break
                        task.state = str(completed_task.state)
                        self._logger.debug('Found task %s in state %s' %
                                           (task.uid, task.state))

                        if completed_task.path:
                            task.path = str(completed_task.path)

                        # mq_channel.basic_publish(
                        #        exchange='',
                        #        routing_key=reply_to,
                        #        properties=pika.BasicProperties(
                        #            correlation_id=corr_id),
                        #        body='%s-ack' % task.uid)

                        state = msg['object']['state']
                        self._prof.prof('pub_ack_state_%s' % state,
                                        uid=msg['object']['uid'])

                        mq_channel.basic_ack(
                            delivery_tag=method_frame.delivery_tag)

                        self._report.ok('Update: ')
                        self._report.info('%s state: %s\n' %
                                          (task.luid, task.state))

                        # found_task = True
                        break

                # if not found_task:
                #
                #     # If there was a Stage update, but the Stage was
                #     # not found in any of the Pipelines. This
                #     # means that this was a Stage that was added
                #     # during runtime and the AppManager does not
                #     # know about it. The current solution is going
                #     # to be: add it to the workflow object in the
                #     # AppManager via the synchronizer.
                #
                #     self._logger.info('Adding new task %s to \
                #                         parent stage: %s'
                #                         % (completed_task.uid,
                #                         stage.uid))
                #
                #     self._prof.prof('adap_add_task_start',
                #                     uid=completed_task.uid)
                #     stage.add_tasks(completed_task)
                #     self._prof.prof('adap_add_task_stop',
                #                     uid=completed_task.uid)
                #
                #     mq_channel.basic_publish(exchange='',
                #                 routing_key=reply_to,
                #                 properties=pika.BasicProperties(
                #                     correlation_id=corr_id),
                #                 body='%s-ack' % completed_task.uid)
                #
                #     self._prof.prof('pub_ack_state_%s' %
                #                 msg['object']['state'],
                #                 uid=msg['object']['uid'])
                #
                #     mq_channel.basic_ack(
                #         delivery_tag=method_frame.delivery_tag)
                #     self._report.ok('Update: ')
                #     self._report.info('%s state: %s\n' %
                #     (completed_task.luid, completed_task.state))

    # --------------------------------------------------------------------------
    #
    def _synchronizer(self):

        try:
            self._synchronizer_work()

        except KeyboardInterrupt:
            self._logger.exception('Execution interrupted by user (you '
                                   'probably hit Ctrl+C), trying to terminate '
                                   'synchronizer thread gracefully...')
            raise

        except Exception:
            self._logger.exception('Unknown error in synchronizer, '
                                   'terminating thread')
            raise

    # --------------------------------------------------------------------------
    #
    def _synchronizer_work(self):
        '''
        **Purpose**: Thread in the master process to keep the workflow data
                     structure in the appmanager up to date. We receive only
                     Task objects from the task manager.

        Details:     Note that acknowledgements of the type
                     `channel.basic_ack()` are acknowledgements to the server
                     that the msg was received. They are not to be confused
                     with the Ack sent to the task_manager through the
                     sync-ack queue.
        '''

        self._prof.prof('sync_thread_start', uid=self._uid)
        self._logger.info('synchronizer thread started')

        mq_connection = pika.BlockingConnection(self._rmq_conn_params)
        mq_channel = mq_connection.channel()

        last = time.time()
        qname_t2s = '%s-tmgr-to-sync' % self._sid
        qname_c2s = '%s-cb-to-sync' % self._sid

        while not self._terminate_sync.is_set():

            # wrapper to call `_update_task()`
            self._get_message_to_sync(mq_channel, qname_t2s)
            self._get_message_to_sync(mq_channel, qname_c2s)

            # keep the connection alive, so that pika does not declare it dead
            now = time.time()
            if now - last >= self._rmq_ping_interval:
                mq_connection.process_data_events()
                last = now

        self._prof.prof('sync_thread_stop', uid=self._uid)
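
The snippet above shows AppManager internals; for contrast, a minimal
user-facing driver script looks like the sketch below. This is an
illustrative example and not part of the original snippet: it assumes a
local RabbitMQ broker, and the executable and resource values are
placeholders.

import os

from radical.entk import AppManager, Pipeline, Stage, Task

hostname = os.environ.get('RMQ_HOSTNAME', 'localhost')
port     = int(os.environ.get('RMQ_PORT', '5672'))

# one pipeline with a single one-task stage
t            = Task()
t.executable = '/bin/date'

s = Stage()
s.add_tasks(t)

p = Pipeline()
p.add_stages(s)

amgr               = AppManager(hostname=hostname, port=port)
amgr.resource_desc = {'resource': 'local.localhost',
                      'walltime': 10,
                      'cpus'    : 1}
amgr.workflow      = [p]
amgr.run()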
Example #12
    def run(self):
        """
        **Purpose**: Run the application manager once the workflow and
        resource manager have been assigned. Invoking this method sets up the
        communication infrastructure, submits a resource request and then
        submits all the tasks.
        """

        try:

            # Set None objects local to each run
            self._wfp = None
            self._sync_thread = None
            self._terminate_sync = Event()
            self._resubmit_failed = False
            self._cur_attempt = 1

            if not self._workflow:
                self._logger.error(
                    'No workflow assigned currently, please check your script')
                raise MissingError(obj=self._uid, missing_attribute='workflow')

            if not self._resource_manager:
                self._logger.error(
                    'No resource manager assigned currently, please create and add a valid resource manager'
                )
                raise MissingError(obj=self._uid,
                                   missing_attribute='resource_manager')

            self._prof.prof('amgr run started', uid=self._uid)

            # Setup rabbitmq stuff
            if not self._mqs_setup:

                self._report.info('Setting up RabbitMQ system')
                setup = self._setup_mqs()

                if not setup:
                    self._logger.error('RabbitMQ system not available')
                    raise EnTKError("RabbitMQ setup failed")

                self._mqs_setup = True

                self._report.ok('>>ok\n')

            # Create WFProcessor object
            self._prof.prof('creating wfp obj', uid=self._uid)
            self._wfp = WFprocessor(sid=self._sid,
                                    workflow=self._workflow,
                                    pending_queue=self._pending_queue,
                                    completed_queue=self._completed_queue,
                                    mq_hostname=self._mq_hostname,
                                    port=self._port,
                                    resubmit_failed=self._resubmit_failed)
            self._wfp._initialize_workflow()
            self._workflow = self._wfp.workflow

            # Submit a resource request if no allocation has been done yet,
            # or submit a new one if the old request has completed
            if self._resource_manager:
                res_alloc_state = self._resource_manager.get_resource_allocation_state(
                )
                if (not res_alloc_state) or (
                        res_alloc_state
                        in self._resource_manager.get_completed_states()):

                    self._logger.info('Starting resource request submission')
                    self._prof.prof('init rreq submission', uid=self._uid)
                    self._resource_manager._submit_resource_request()

            else:

                self._logger.error(
                    'Cannot run without resource manager, please create and assign a resource manager'
                )
                raise EnTKError(text='Missing resource manager')

            # Start synchronizer thread
            if not self._sync_thread:
                self._logger.info('Starting synchronizer thread')
                self._sync_thread = Thread(target=self._synchronizer,
                                           name='synchronizer-thread')
                self._prof.prof('starting synchronizer thread', uid=self._uid)
                self._sync_thread.start()

            # Start WFprocessor
            self._logger.info('Starting WFProcessor process from AppManager')
            self._wfp.start_processor()

            self._report.ok('All components created\n')

            # Select the TaskManager implementation for the configured RTS
            if self._rts == 'radical.pilot':
                from radical.entk.execman.rp import TaskManager
            elif self._rts == 'mock':
                from radical.entk.execman.mock import TaskManager

            # Create the tmgr object only if it does not already exist
            if not self._task_manager:
                self._prof.prof('creating tmgr obj', uid=self._uid)
                self._task_manager = TaskManager(
                    sid=self._sid,
                    pending_queue=self._pending_queue,
                    completed_queue=self._completed_queue,
                    mq_hostname=self._mq_hostname,
                    rmgr=self._resource_manager,
                    port=self._port)
                self._logger.info(
                    'Starting task manager process from AppManager')
                self._task_manager.start_manager()
                self._task_manager.start_heartbeat()

            active_pipe_count = len(self._workflow)
            finished_pipe_uids = []

            # We wait till all pipelines of the workflow are marked
            # complete
            while ((active_pipe_count > 0)
                   and (self._wfp.workflow_incomplete())
                   and (self._resource_manager.get_resource_allocation_state()
                        not in self._resource_manager.get_completed_states())):

                if active_pipe_count > 0:

                    for pipe in self._workflow:

                        with pipe.lock:

                            if (pipe.completed) and (
                                    pipe.uid not in finished_pipe_uids):

                                self._logger.info('Pipe %s completed' %
                                                  pipe.uid)
                                finished_pipe_uids.append(pipe.uid)
                                active_pipe_count -= 1
                                self._logger.info('Active pipes: %s' %
                                                  active_pipe_count)

                if (not self._sync_thread.is_alive()) and (self._cur_attempt <=
                                                           self._reattempts):

                    self._sync_thread = Thread(target=self._synchronizer,
                                               name='synchronizer-thread')
                    self._logger.info('Restarting synchronizer thread')
                    self._prof.prof('restarting synchronizer', uid=self._uid)
                    self._sync_thread.start()

                    self._cur_attempt += 1

                if (not self._wfp.check_processor()) and (self._cur_attempt <=
                                                          self._reattempts):
                    """
                    If WFP dies, both child threads are also cleaned out.
                    We simply recreate the wfp object with a copy of the workflow
                    in the appmanager and start the processor.
                    """

                    self._prof.prof('recreating wfp obj', uid=self._uid)
                    self._wfp = WFprocessor(
                        sid=self._sid,
                        workflow=self._workflow,
                        pending_queue=self._pending_queue,
                        completed_queue=self._completed_queue,
                        mq_hostname=self._mq_hostname,
                        port=self._port,
                        resubmit_failed=self._resubmit_failed)

                    self._logger.info(
                        'Restarting WFProcessor process from AppManager')
                    self._wfp.start_processor()

                    self._cur_attempt += 1

                if (not self._task_manager.check_heartbeat()) and (
                        self._cur_attempt <= self._reattempts):
                    """
                    If the tmgr process or heartbeat dies, we simply start a
                    new process using the start_manager method. We do not
                    need to create a new instance of the TaskManager object
                    itself. We stop and start a new instance of the
                    heartbeat thread as well.
                    """
                    self._prof.prof('restarting tmgr process and heartbeat',
                                    uid=self._uid)

                    self._logger.info('Terminating heartbeat thread')
                    self._task_manager.terminate_heartbeat()
                    self._logger.info('Terminating tmgr process')
                    self._task_manager.terminate_manager()
                    self._logger.info('Restarting task manager process')
                    self._task_manager.start_manager()
                    self._logger.info('Restarting heartbeat thread')
                    self._task_manager.start_heartbeat()

                    self._cur_attempt += 1

            self._prof.prof('start termination', uid=self._uid)

            # Terminate threads in the following order: wfp, helper, synchronizer
            self._logger.info('Terminating WFprocessor')
            self._wfp.terminate_processor()

            self._logger.info('Terminating synchronizer thread')
            self._terminate_sync.set()
            self._sync_thread.join()
            self._logger.info('Synchronizer thread terminated')

            if self._autoterminate:
                self.resource_terminate()

            if self._write_workflow:
                write_workflow(self._workflow, self._sid)

            self._prof.prof('termination done', uid=self._uid)

        except KeyboardInterrupt:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.error(
                'Execution interrupted by user (you probably hit Ctrl+C), ' +
                'trying to cancel enqueuer thread gracefully...')

            # Terminate threads in the following order: wfp, helper, synchronizer
            if self._wfp:
                self._logger.info('Terminating WFprocessor')
                self._wfp.terminate_processor()

            if self._task_manager:
                self._logger.info('Terminating task manager process')
                self._task_manager.terminate_manager()
                self._task_manager.terminate_heartbeat()

            if self._sync_thread:
                self._logger.info('Terminating synchronizer thread')
                self._terminate_sync.set()
                self._sync_thread.join()
                self._logger.info('Synchronizer thread terminated')

            if self._resource_manager:
                self._resource_manager._terminate_resource_request()

            self._prof.prof('termination done', uid=self._uid)

            raise KeyboardInterrupt

        except Exception as ex:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.exception('Error in AppManager: %s' % ex)

            # Terminate threads in the following order: wfp, helper, synchronizer
            if self._wfp:
                self._logger.info('Terminating WFprocessor')
                self._wfp.terminate_processor()

            if self._task_manager:
                self._logger.info('Terminating task manager process')
                self._task_manager.terminate_manager()
                self._task_manager.terminate_heartbeat()

            if self._sync_thread:
                self._logger.info('Terminating synchronizer thread')
                self._terminate_sync.set()
                self._sync_thread.join()
                self._logger.info('Synchronizer thread terminated')

            if self._resource_manager:
                self._resource_manager._terminate_resource_request()

            self._prof.prof('termination done', uid=self._uid)
            raise
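
The recovery logic inside run() above repeats one pattern per component:
poll liveness, restart the component in place, and charge the restart
against a shared reattempt budget. Below is a minimal standalone sketch of
that pattern; the Component class and the workflow_incomplete callable are
hypothetical stand-ins, not EnTK API.

import time


class Component(object):

    # toy stand-in for the synchronizer / wfp / tmgr components
    def __init__(self, name):
        self.name = name

    def is_alive(self):
        # a real check would inspect the underlying thread or process
        return True

    def restart(self):
        print('restarting %s' % self.name)


def watchdog(components, reattempts, workflow_incomplete, poll=1.0):

    cur_attempt = 1
    while workflow_incomplete():
        for comp in components:
            if not comp.is_alive() and cur_attempt <= reattempts:
                comp.restart()           # same object, fresh thread/process
                cur_attempt += 1
        time.sleep(poll)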
Example #13
class AppManager(object):
    """
    An application manager takes the responsibility of setting up the communication infrastructure, instantiates the
    ResourceManager, TaskManager, WFProcessor objects and all their threads and processes. This is the Master object
    running in the main process and is designed to recover from errors from all other objects, threads and processes.

    :Arguments:
        :config_path: Url to config path to be read for AppManager
        :hostname: host rabbitmq server is running
        :port: port at which rabbitmq can be accessed
        :reattempts: number of attempts to re-invoke any failed EnTK components
        :resubmit_failed: resubmit failed tasks (True/False)
        :autoterminate: terminate resource reservation upon execution of all tasks of first workflow (True/False)
        :write_workflow: write workflow and mapping to rts entities to a file (post-termination)
        :rts: Specify RTS to use. Current options: 'mock', 'radical.pilot' (default if unspecified)
        :rmq_cleanup: Cleanup all queues created in RabbitMQ server for current execution (default is True)
        :rts_config: Configuration for the RTS, accepts {"sandbox_cleanup": True/False,"db_cleanup": True/False} when RTS is RP
        :name: Name of the Application. It should be unique between executions. (default is randomly assigned)
    """
    def __init__(self,
                 config_path=None,
                 hostname=None,
                 port=None,
                 reattempts=None,
                 resubmit_failed=None,
                 autoterminate=None,
                 write_workflow=None,
                 rts=None,
                 rmq_cleanup=None,
                 rts_config=None,
                 name=None):

        # Create a session for each EnTK script execution
        if name:
            self._name = name
            self._sid = name
        else:
            self._name = str()
            self._sid = ru.generate_id('re.session', ru.ID_PRIVATE)

        self._read_config(config_path, hostname, port, reattempts,
                          resubmit_failed, autoterminate, write_workflow, rts,
                          rmq_cleanup, rts_config)

        # Create an uid + logger + profiles for AppManager, under the sid
        # namespace
        path = os.getcwd() + '/' + self._sid
        self._uid = ru.generate_id('appmanager.%(item_counter)04d',
                                   ru.ID_CUSTOM,
                                   namespace=self._sid)
        self._logger = ru.Logger('radical.entk.%s' % self._uid,
                                 path=path,
                                 targets=['2', '.'])
        self._prof = ru.Profiler(name='radical.entk.%s' % self._uid, path=path)
        self._report = ru.Reporter(name='radical.entk.%s' % self._uid)

        self._report.info('EnTK session: %s\n' % self._sid)
        self._prof.prof('create amgr obj', uid=self._uid)
        self._report.info('Creating AppManager')

        self._resource_manager = None
        # RabbitMQ Queues
        self._pending_queue = list()
        self._completed_queue = list()

        # Global parameters to have default values
        self._mqs_setup = False
        self._resource_desc = None
        self._task_manager = None
        self._workflow = None
        self._cur_attempt = 1
        self._shared_data = list()

        self._rmq_ping_interval = int(os.getenv('RMQ_PING_INTERVAL', 10))

        self._logger.info('Application Manager initialized')
        self._prof.prof('amgr obj created', uid=self._uid)
        self._report.ok('>>ok\n')

    def _read_config(self, config_path, hostname, port, reattempts,
                     resubmit_failed, autoterminate, write_workflow, rts,
                     rmq_cleanup, rts_config):

        if not config_path:
            config_path = os.path.dirname(os.path.abspath(__file__))

        config = ru.read_json(os.path.join(config_path, 'config.json'))

        self._mq_hostname = hostname if hostname else str(config['hostname'])
        self._port = port if port else config['port']
        self._reattempts = reattempts if reattempts else config['reattempts']
        self._resubmit_failed = resubmit_failed if resubmit_failed is not None else config[
            'resubmit_failed']
        self._autoterminate = autoterminate if autoterminate is not None else config[
            'autoterminate']
        self._write_workflow = write_workflow if write_workflow is not None else config[
            'write_workflow']
        self._rts = rts if rts in ['radical.pilot', 'mock'] else str(
            config['rts'])
        self._rmq_cleanup = rmq_cleanup if rmq_cleanup is not None else config[
            'rmq_cleanup']
        self._rts_config = rts_config if rts_config is not None else config[
            'rts_config']

        self._num_pending_qs = config['pending_qs']
        self._num_completed_qs = config['completed_qs']

    # ------------------------------------------------------------------------------------------------------------------
    # Getter functions
    # ------------------------------------------------------------------------------------------------------------------

    @property
    def name(self):
        """
        Name of the application manager. Allows the user to set the name of
        the application manager, as well as its session ID. This name should
        be unique between different EnTK executions, otherwise it will
        produce an error.

        :getter: Returns the name of the application manager
        :setter: Assigns the name of the application manager
        :type: String
        """

        return self._name

    @property
    def sid(self):
        """
        Get the session ID of the current EnTK execution

        :getter: Returns the session ID of the EnTK execution
        :type: String
        """

        return self._sid

    @property
    def resource_desc(self):
        """
        :getter: Returns the resource description
        :setter: Assigns a resource description
        """

        return self._resource_desc

    @property
    def workflow(self):
        """
        :getter: Return the workflow assigned for execution
        :setter: Assign workflow to be executed
        """

        return self._workflow

    @property
    def shared_data(self):
        """
        :getter: Return list of filenames that are shared between multiple tasks of the application
        :setter: Assign a list of names of files that need to be staged to the remote machine
        """

        return self._shared_data

    # ------------------------------------------------------------------------------------------------------------------
    # Setter functions
    # ------------------------------------------------------------------------------------------------------------------

    @name.setter
    def name(self, value):

        if not isinstance(value, str):
            raise TypeError(expected_type=str, actual_type=type(value))

        self._name = value

    @resource_desc.setter
    def resource_desc(self, value):

        if self._rts == 'radical.pilot':
            from radical.entk.execman.rp import ResourceManager
            self._resource_manager = ResourceManager(
                resource_desc=value,
                sid=self._sid,
                rts_config=self._rts_config)
        elif self._rts == 'mock':
            from radical.entk.execman.mock import ResourceManager
            self._resource_manager = ResourceManager(resource_desc=value,
                                                     sid=self._sid)

        self._report.info('Validating and assigning resource manager')

        if self._resource_manager._validate_resource_desc():
            self._resource_manager._populate()
            self._resource_manager.shared_data = self._shared_data
        else:
            self._logger.error('Could not validate resource description')
            raise EnTKError('Could not validate resource description')
        self._report.ok('>>ok\n')

    @workflow.setter
    def workflow(self, workflow):

        self._prof.prof('assigning workflow', uid=self._uid)

        for p in workflow:
            if not isinstance(p, Pipeline):
                self._logger.info('workflow type incorrect')
                raise TypeError(expected_type=['Pipeline', 'set of Pipelines'],
                                actual_type=type(p))

            p._validate()

        self._workflow = workflow
        self._logger.info('Workflow assigned to Application Manager')

    @shared_data.setter
    def shared_data(self, data):

        if not isinstance(data, list):
            data = [data]

        for val in data:
            if not isinstance(val, str):
                raise TypeError(expected_type=str, actual_type=type(val))

        self._shared_data = data

        if self._resource_manager:
            self._resource_manager.shared_data = data

    # ------------------------------------------------------------------------------------------------------------------
    # Public methods
    # ------------------------------------------------------------------------------------------------------------------

    def run(self):
        """
        **Purpose**: Run the application manager once the workflow and
        resource manager have been assigned. Invoking this method sets up the
        communication infrastructure, submits a resource request and then
        submits all the tasks.
        """

        try:

            # Set None objects local to each run
            self._wfp = None
            self._sync_thread = None
            self._terminate_sync = Event()
            self._resubmit_failed = False
            self._cur_attempt = 1

            if not self._workflow:
                self._logger.error(
                    'No workflow assigned currently, please check your script')
                raise MissingError(obj=self._uid, missing_attribute='workflow')

            if not self._resource_manager:
                self._logger.error(
                    'No resource manager assigned currently, please create and add a valid resource manager'
                )
                raise MissingError(obj=self._uid,
                                   missing_attribute='resource_manager')

            self._prof.prof('amgr run started', uid=self._uid)

            # Setup rabbitmq stuff
            if not self._mqs_setup:

                self._report.info('Setting up RabbitMQ system')
                setup = self._setup_mqs()

                if not setup:
                    self._logger.error('RabbitMQ system not available')
                    raise EnTKError("RabbitMQ setup failed")

                self._mqs_setup = True

                self._report.ok('>>ok\n')

            # Create WFProcessor object
            self._prof.prof('creating wfp obj', uid=self._uid)
            self._wfp = WFprocessor(sid=self._sid,
                                    workflow=self._workflow,
                                    pending_queue=self._pending_queue,
                                    completed_queue=self._completed_queue,
                                    mq_hostname=self._mq_hostname,
                                    port=self._port,
                                    resubmit_failed=self._resubmit_failed)
            self._wfp._initialize_workflow()
            self._workflow = self._wfp.workflow

            # Submit a resource request if no allocation has been done yet,
            # or submit a new one if the old request has completed
            if self._resource_manager:
                res_alloc_state = self._resource_manager.get_resource_allocation_state(
                )
                if (not res_alloc_state) or (
                        res_alloc_state
                        in self._resource_manager.get_completed_states()):

                    self._logger.info('Starting resource request submission')
                    self._prof.prof('init rreq submission', uid=self._uid)
                    self._resource_manager._submit_resource_request()

            else:

                self._logger.error(
                    'Cannot run without resource manager, please create and assign a resource manager'
                )
                raise EnTKError(text='Missing resource manager')

            # Start synchronizer thread
            if not self._sync_thread:
                self._logger.info('Starting synchronizer thread')
                self._sync_thread = Thread(target=self._synchronizer,
                                           name='synchronizer-thread')
                self._prof.prof('starting synchronizer thread', uid=self._uid)
                self._sync_thread.start()

            # Start WFprocessor
            self._logger.info('Starting WFProcessor process from AppManager')
            self._wfp.start_processor()

            self._report.ok('All components created\n')

            # Select the TaskManager implementation for the configured RTS
            if self._rts == 'radical.pilot':
                from radical.entk.execman.rp import TaskManager
            elif self._rts == 'mock':
                from radical.entk.execman.mock import TaskManager

            # Create the tmgr object only if it does not already exist
            if not self._task_manager:
                self._prof.prof('creating tmgr obj', uid=self._uid)
                self._task_manager = TaskManager(
                    sid=self._sid,
                    pending_queue=self._pending_queue,
                    completed_queue=self._completed_queue,
                    mq_hostname=self._mq_hostname,
                    rmgr=self._resource_manager,
                    port=self._port)
                self._logger.info(
                    'Starting task manager process from AppManager')
                self._task_manager.start_manager()
                self._task_manager.start_heartbeat()

            active_pipe_count = len(self._workflow)
            finished_pipe_uids = []

            # We wait till all pipelines of the workflow are marked
            # complete
            while ((active_pipe_count > 0)
                   and (self._wfp.workflow_incomplete())
                   and (self._resource_manager.get_resource_allocation_state()
                        not in self._resource_manager.get_completed_states())):

                if active_pipe_count > 0:

                    for pipe in self._workflow:

                        with pipe.lock:

                            if (pipe.completed) and (
                                    pipe.uid not in finished_pipe_uids):

                                self._logger.info('Pipe %s completed' %
                                                  pipe.uid)
                                finished_pipe_uids.append(pipe.uid)
                                active_pipe_count -= 1
                                self._logger.info('Active pipes: %s' %
                                                  active_pipe_count)

                if (not self._sync_thread.is_alive()) and (self._cur_attempt <=
                                                           self._reattempts):

                    self._sync_thread = Thread(target=self._synchronizer,
                                               name='synchronizer-thread')
                    self._logger.info('Restarting synchronizer thread')
                    self._prof.prof('restarting synchronizer', uid=self._uid)
                    self._sync_thread.start()

                    self._cur_attempt += 1

                if (not self._wfp.check_processor()) and (self._cur_attempt <=
                                                          self._reattempts):
                    """
                    If WFP dies, both child threads are also cleaned out.
                    We simply recreate the wfp object with a copy of the workflow
                    in the appmanager and start the processor.
                    """

                    self._prof.prof('recreating wfp obj', uid=self._uid)
                    self._wfp = WFprocessor(
                        sid=self._sid,
                        workflow=self._workflow,
                        pending_queue=self._pending_queue,
                        completed_queue=self._completed_queue,
                        mq_hostname=self._mq_hostname,
                        port=self._port,
                        resubmit_failed=self._resubmit_failed)

                    self._logger.info(
                        'Restarting WFProcessor process from AppManager')
                    self._wfp.start_processor()

                    self._cur_attempt += 1

                if (not self._task_manager.check_heartbeat()) and (
                        self._cur_attempt <= self._reattempts):
                    """
                    If the tmgr process or heartbeat dies, we simply start a
                    new process using the start_manager method. We do not
                    need to create a new instance of the TaskManager object
                    itself. We stop and start a new instance of the
                    heartbeat thread as well.
                    """
                    self._prof.prof('restarting tmgr process and heartbeat',
                                    uid=self._uid)

                    self._logger.info('Terminating heartbeat thread')
                    self._task_manager.terminate_heartbeat()
                    self._logger.info('Terminating tmgr process')
                    self._task_manager.terminate_manager()
                    self._logger.info('Restarting task manager process')
                    self._task_manager.start_manager()
                    self._logger.info('Restarting heartbeat thread')
                    self._task_manager.start_heartbeat()

                    self._cur_attempt += 1

            self._prof.prof('start termination', uid=self._uid)

            # Terminate threads in the following order: wfp, helper, synchronizer
            self._logger.info('Terminating WFprocessor')
            self._wfp.terminate_processor()

            self._logger.info('Terminating synchronizer thread')
            self._terminate_sync.set()
            self._sync_thread.join()
            self._logger.info('Synchronizer thread terminated')

            if self._autoterminate:
                self.resource_terminate()

            if self._write_workflow:
                write_workflow(self._workflow, self._sid)

            self._prof.prof('termination done', uid=self._uid)

        except KeyboardInterrupt:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.error(
                'Execution interrupted by user (you probably hit Ctrl+C), ' +
                'trying to cancel enqueuer thread gracefully...')

            # Terminate threads in the following order: wfp, helper, synchronizer
            if self._wfp:
                self._logger.info('Terminating WFprocessor')
                self._wfp.terminate_processor()

            if self._task_manager:
                self._logger.info('Terminating task manager process')
                self._task_manager.terminate_manager()
                self._task_manager.terminate_heartbeat()

            if self._sync_thread:
                self._logger.info('Terminating synchronizer thread')
                self._terminate_sync.set()
                self._sync_thread.join()
                self._logger.info('Synchronizer thread terminated')

            if self._resource_manager:
                self._resource_manager._terminate_resource_request()

            self._prof.prof('termination done', uid=self._uid)

            raise KeyboardInterrupt

        except Exception as ex:

            self._prof.prof('start termination', uid=self._uid)

            self._logger.exception('Error in AppManager: %s' % ex)

            # Terminate threads in the following order: wfp, helper, synchronizer
            if self._wfp:
                self._logger.info('Terminating WFprocessor')
                self._wfp.terminate_processor()

            if self._task_manager:
                self._logger.info('Terminating task manager process')
                self._task_manager.terminate_manager()
                self._task_manager.terminate_heartbeat()

            if self._sync_thread:
                self._logger.info('Terminating synchronizer thread')
                self._terminate_sync.set()
                self._sync_thread.join()
                self._logger.info('Synchronizer thread terminated')

            if self._resource_manager:
                self._resource_manager._terminate_resource_request()

            self._prof.prof('termination done', uid=self._uid)
            raise
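
_read_config above resolves every setting as "constructor argument if
given, else value from config.json". Below is a minimal standalone sketch
of that fallback pattern; the config keys mirror the snippet, and `ru` is
radical.utils, whose read_json the snippet itself uses.

import os

import radical.utils as ru


def read_config(config_path=None, port=None, reattempts=None):

    if not config_path:
        config_path = os.path.dirname(os.path.abspath(__file__))

    config = ru.read_json(os.path.join(config_path, 'config.json'))

    # explicit arguments win; config.json provides the defaults
    port       = port       if port       is not None else config['port']
    reattempts = reattempts if reattempts is not None else config['reattempts']

    return port, reattempts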
Example #14
def test_write_session_description():

    hostname = os.environ.get('RMQ_HOSTNAME', 'localhost')
    port = int(os.environ.get('RMQ_PORT', 5672))
    amgr = AppManager(hostname=hostname, port=port)
    amgr.resource_desc = {
        'resource': 'xsede.stampede',
        'walltime': 60,
        'cpus': 128,
        'gpus': 64,
        'project': 'xyz',
        'queue': 'high'
    }

    workflow = [generate_pipeline(1), generate_pipeline(2)]
    amgr.workflow = workflow

    amgr._wfp = WFprocessor(sid=amgr._sid,
                            workflow=amgr._workflow,
                            pending_queue=amgr._pending_queue,
                            completed_queue=amgr._completed_queue,
                            mq_hostname=amgr._mq_hostname,
                            port=amgr._port,
                            resubmit_failed=amgr._resubmit_failed)
    amgr._wfp._initialize_workflow()
    amgr._workflow = amgr._wfp.workflow

    amgr._task_manager = TaskManager(sid=amgr._sid,
                                     pending_queue=amgr._pending_queue,
                                     completed_queue=amgr._completed_queue,
                                     mq_hostname=amgr._mq_hostname,
                                     rmgr=amgr._resource_manager,
                                     port=amgr._port)

    # os.mkdir(amgr._sid)

    write_session_description(amgr)

    desc = ru.read_json('%s/radical.entk.%s.json' % (amgr._sid, amgr._sid))

    assert desc == {
        'config': {},
        'entities': {
            'appmanager': {
                'event_model': {},
                'state_model': None,
                'state_values': None
            },
            'pipeline': {
                'event_model': {},
                'state_model': {
                    'CANCELED': 9,
                    'DESCRIBED': 1,
                    'DONE': 9,
                    'FAILED': 9,
                    'SCHEDULING': 2
                },
                'state_values': {
                    '1': 'DESCRIBED',
                    '2': 'SCHEDULING',
                    '9': ['DONE', 'CANCELED', 'FAILED']
                }
            },
            'stage': {
                'event_model': {},
                'state_model': {
                    'CANCELED': 9,
                    'DESCRIBED': 1,
                    'DONE': 9,
                    'FAILED': 9,
                    'SCHEDULED': 3,
                    'SCHEDULING': 2
                },
                'state_values': {
                    '1': 'DESCRIBED',
                    '2': 'SCHEDULING',
                    '3': 'SCHEDULED',
                    '9': ['FAILED', 'CANCELED', 'DONE']
                }
            },
            'task': {
                'event_model': {},
                'state_model': {
                    'CANCELED': 9,
                    'DEQUEUED': 8,
                    'DEQUEUEING': 7,
                    'DESCRIBED': 1,
                    'DONE': 9,
                    'EXECUTED': 6,
                    'FAILED': 9,
                    'SCHEDULED': 3,
                    'SCHEDULING': 2,
                    'SUBMITTED': 5,
                    'SUBMITTING': 4
                },
                'state_values': {
                    '1': 'DESCRIBED',
                    '2': 'SCHEDULING',
                    '3': 'SCHEDULED',
                    '4': 'SUBMITTING',
                    '5': 'SUBMITTED',
                    '6': 'EXECUTED',
                    '7': 'DEQUEUEING',
                    '8': 'DEQUEUED',
                    '9': ['DONE', 'CANCELED', 'FAILED']
                }
            }
        },
        'tree': {
            'appmanager.0000': {
                'cfg': {},
                'children': [
                    'wfprocessor.0000', 'resource_manager.0000',
                    'task_manager.0000', 'pipeline.0000', 'pipeline.0001'
                ],
                'etype':
                'appmanager',
                'has': [
                    'pipeline', 'wfprocessor', 'resource_manager',
                    'task_manager'
                ],
                'uid':
                'appmanager.0000'
            },
            'pipeline.0000': {
                'cfg': {},
                'children': ['stage.0000', 'stage.0001'],
                'etype': 'pipeline',
                'has': ['stage'],
                'uid': 'pipeline.0000'
            },
            'pipeline.0001': {
                'cfg': {},
                'children': ['stage.0002', 'stage.0003'],
                'etype': 'pipeline',
                'has': ['stage'],
                'uid': 'pipeline.0001'
            },
            'resource_manager.0000': {
                'cfg': {},
                'children': [],
                'etype': 'resource_manager',
                'has': [],
                'uid': 'resource_manager.0000'
            },
            'stage.0000': {
                'cfg': {},
                'children': ['task.0000'],
                'etype': 'stage',
                'has': ['task'],
                'uid': 'stage.0000'
            },
            'stage.0001': {
                'cfg': {},
                'children': [
                    'task.0001', 'task.0002', 'task.0003', 'task.0004',
                    'task.0005', 'task.0006', 'task.0007', 'task.0008',
                    'task.0009', 'task.0010'
                ],
                'etype':
                'stage',
                'has': ['task'],
                'uid':
                'stage.0001'
            },
            'stage.0002': {
                'cfg': {},
                'children': ['task.0011'],
                'etype': 'stage',
                'has': ['task'],
                'uid': 'stage.0002'
            },
            'stage.0003': {
                'cfg': {},
                'children': [
                    'task.0012', 'task.0013', 'task.0014', 'task.0015',
                    'task.0016', 'task.0017', 'task.0018', 'task.0019',
                    'task.0020', 'task.0021'
                ],
                'etype': 'stage',
                'has': ['task'],
                'uid': 'stage.0003'
            },
            'task.0000': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0000'
            },
            'task.0001': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0001'
            },
            'task.0002': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0002'
            },
            'task.0003': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0003'
            },
            'task.0004': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0004'
            },
            'task.0005': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0005'
            },
            'task.0006': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0006'
            },
            'task.0007': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0007'
            },
            'task.0008': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0008'
            },
            'task.0009': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0009'
            },
            'task.0010': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0010'
            },
            'task.0011': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0011'
            },
            'task.0012': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0012'
            },
            'task.0013': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0013'
            },
            'task.0014': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0014'
            },
            'task.0015': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0015'
            },
            'task.0016': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0016'
            },
            'task.0017': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0017'
            },
            'task.0018': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0018'
            },
            'task.0019': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0019'
            },
            'task.0020': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0020'
            },
            'task.0021': {
                'cfg': {},
                'children': [],
                'etype': 'task',
                'has': [],
                'uid': 'task.0021'
            },
            'task_manager.0000': {
                'cfg': {},
                'children': [],
                'etype': 'task_manager',
                'has': [],
                'uid': 'task_manager.0000'
            },
            'wfprocessor.0000': {
                'cfg': {},
                'children': [],
                'etype': 'wfprocessor',
                'has': [],
                'uid': 'wfprocessor.0000'
            }
        }
    }

    shutil.rmtree(amgr._sid)
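
The expected session description above is a plain dict, so a lightweight consistency check over its 'tree' section can catch dangling children or mismatched uids before comparing against the written file. A minimal sketch, assuming a dict of the shape shown (validate_tree is a hypothetical helper, not part of the EnTK API):

def validate_tree(desc):
    # walk the 'tree' section of a session description and verify that
    # every node is self-consistent and every child reference resolves
    tree = desc['tree']
    for uid, node in tree.items():
        assert node['uid'] == uid                      # key matches payload
        assert uid.startswith(node['etype'] + '.')     # uid encodes the etype
        for child in node['children']:
            assert child in tree                       # no dangling children
            assert child.split('.')[0] in node['has']  # child type is declared
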
Example #15
    def test_start_manager(self, mocked_init, mocked_Logger, mocked_Profiler):
        rmq_params = mock.MagicMock(spec=ConnectionParameters)
        rmgr = mock.MagicMock(spec=RPRmgr)
        tmgr = RPTmgr('test_tmgr', ['pending_queues'], ['completed_queues'],
                      rmgr, rmq_params)

        # attach the attributes that the mocked-out constructor (mocked_init)
        # would normally have set up
        tmgr._log = mocked_Logger
        tmgr._prof = mocked_Profiler
        tmgr._uid = 'tmgr.0000'
        tmgr._rmgr = 'test_rmgr'
        tmgr._rmq_conn_params = rmq_params
        tmgr._pending_queue = ['pending_queues']
        tmgr._completed_queue = ['completed_queues']
        # _tmgr_side_effect is a helper defined elsewhere in the test module;
        # it stands in for the real worker loop
        tmgr._tmgr = _tmgr_side_effect

        tmgr._tmgr_terminate = None
        tmgr._tmgr_process = None
        tmgr.start_manager()

        # start_manager() should create a terminate event and spawn a live
        # child process running the (mocked) worker
        try:
            self.assertIsInstance(tmgr._tmgr_terminate, mp.synchronize.Event)
            self.assertIsInstance(tmgr._tmgr_process, mp.context.Process)
            pid = tmgr._tmgr_process.pid
            self.assertTrue(psutil.pid_exists(pid))
        finally:
            if tmgr._tmgr_process.is_alive():
                tmgr._tmgr_process.join()
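
The mocked_init, mocked_Logger and mocked_Profiler parameters imply a stack of mock.patch decorators that this excerpt cuts off. A plausible reconstruction, with the caveat that the exact patch targets are assumptions rather than something this snippet confirms:

    @mock.patch('radical.utils.Profiler')
    @mock.patch('radical.utils.Logger')
    @mock.patch.object(RPTmgr, '__init__', return_value=None)
    def test_start_manager(self, mocked_init, mocked_Logger, mocked_Profiler):
        ...

Decorators are applied bottom-up, so the bottom-most patch supplies the first mock argument: patching __init__ to return None matches mocked_init, which is why the test has to hand-populate the instance attributes itself.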