Example no. 1
    def test_crud_execution(self):
        """It test basic CRUD operations of an Execution class"""

        # We verify that the object has no id before it is stored in the db

        execution = Execution()
        execution.execution_type = "execution_type"
        execution.status = "status"
        self.assertIsNone(execution.id)

        # We store the object in the db
        db.session.add(execution)
        db.session.commit()

        # We recover the execution from the db
        execution = db.session.query(Execution).filter_by(
            execution_type="execution_type").first()
        self.assertIsNotNone(execution.id)
        self.assertEquals("execution_type", execution.execution_type)
        self.assertEquals("status", execution.status)

        # We check that we can update the execution
        execution.execution_type = "X"
        db.session.commit()
        execution_2 = db.session.query(Execution).filter_by(
            execution_type="X").first()
        self.assertEqual(execution.id, execution_2.id)
        self.assertEqual("X", execution.execution_type)

        # We check the deletion
        db.session.delete(execution_2)
        count = db.session.query(Execution).filter_by(
            execution_type="X").count()
        self.assertEqual(0, count)
Example no. 2
def subscribe(actor_id, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    t = threading.Thread(target=process_worker_ch, args=(worker_ch, actor_id, actor_ch))
    t.start()
    print("Worker subscribing to actor channel...")
    while keep_running:
        update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message {}. Starting actor container...".format(str(msg)))
        message = msg.pop("msg", "")
        try:
            stats, logs = execute_actor(actor_id, worker_ch, image, message, msg)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        exc_id = Execution.add_execution(actor_id, stats)
        Execution.set_logs(exc_id, logs)
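
A note on context: keep_running and image are module-level globals here, and process_worker_ch runs in the thread started above. A minimal sketch of what that thread might do, purely as an assumption (the real implementation is not shown in this example):

def process_worker_ch(worker_ch, actor_id, actor_ch):
    """Hypothetical sketch: watch the worker channel for a stop command."""
    global keep_running
    while keep_running:
        msg = worker_ch.get()
        if msg == 'stop':
            # tell the main subscribe() loop to exit and release the actor channel
            keep_running = False
            actor_ch.close()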
Example no. 3
    def setUp(self):
        """
        It creates the in-memory db
        """

        db.create_all()

        # We store some Applications in the db for the tests
        application_1 = Application()
        application_1.name = 'AppName_1'
        application_2 = Application()
        application_2.name = 'AppName_2'

        # Adding execution scripts
        execution_script_1 = ExecutionConfiguration()
        execution_script_1.execution_type = "slurm:sbatch"
        execution_script_2 = ExecutionConfiguration()
        execution_script_2.execution_type = "slurm:sbatch2"
        application_2.execution_configurations = [
            execution_script_1, execution_script_2
        ]

        db.session.add(application_1)
        db.session.add(application_2)

        # We store some testbeds in the db for the tests
        testbed_1 = Testbed("name_1", True, "slurm", "ssh", "user@server",
                            ['slurm'])
        testbed_2 = Testbed("name_2", False, "slurm", "ssh", "user@server",
                            ['slurm'])
        testbed_3 = Testbed("name_3", True, "slurm", "ssh", "user@server",
                            ['slurm', 'slurm:singularity'])
        db.session.add(testbed_1)
        db.session.add(testbed_2)
        db.session.add(testbed_3)
        db.session.commit()

        deployment = Deployment()
        deployment.executable_id = execution_script_1.id
        deployment.testbed_id = testbed_1.id
        db.session.add(deployment)

        # We store some nodes in the db for the tests
        node_1 = Node()
        node_1.name = "node_1"
        node_1.information_retrieved = True
        node_2 = Node()
        node_2.name = "node_2"
        node_2.information_retrieved = False
        db.session.add(node_1)
        db.session.add(node_2)

        execution = Execution()
        execution.execution_type = "execution_type"
        execution.status = "status"
        db.session.add(execution)

        db.session.commit()
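
Tests built on a setUp like this usually pair it with a tearDown that resets the in-memory database between cases. A minimal sketch, assuming the same db object (the original tearDown is not shown here):

    def tearDown(self):
        """It removes the session and drops all tables after each test"""
        db.session.remove()
        db.drop_all()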
Example no. 4
    def test_initialization_execution(self):
        """Test the initializacion method of the class Execution"""

        execution = Execution()
        execution.execution_type = "execution_type"
        execution.status = "status"

        self.assertEquals("execution_type", execution.execution_type)
        self.assertEquals("status", execution.status)
Example no. 5
 def post(self, actor_id):
     try:
         actor = Actor.from_db(actors_store[actor_id])
     except KeyError:
         raise APIException(
             "actor not found: {}".format(actor_id), 404)
     args = self.validate_post()
     Execution.add_execution(actor_id, args)
     return ok(result=actor, msg="Actor execution added successfully.")
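
A hypothetical client call for this endpoint; the base URL is assumed and the payload fields are only an illustration (validate_post defines the real schema):

import requests

# Record an execution for actor 'abc123' on an assumed local deployment
rsp = requests.post('http://localhost:8000/actors/abc123/executions',
                    json={'cpu': 0, 'io': 0, 'runtime': 0})
print(rsp.json())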
Example no. 6
def add_resource(execution):
	"""
	It adds resources to a running execution

	    adapt_compss_resources <master_node> <master_job_id> CREATE SLURM-Cluster default <singularity_image> 
	"""

	if execution.execution_type == execute_type_singularity_pm:
		logging.info("Execution type corresponds with SINGULARITY_PM, trying adaptation")

		if execution.status == Execution.__status_running__:
			url = execution.execution_configuration.testbed.endpoint
			scaling_upper_bound = execution.execution_configuration.application.scaling_upper_bound
			enqueue_env_file = execution.execution_configuration.testbed.extra_config['enqueue_env_file']
			singularity_image_file = execution.execution_configuration.executable.singularity_image_file
			sbatch_id = execution.batch_id

			upper_bound_ok = True
			if scaling_upper_bound is not None and scaling_upper_bound != 0:
				if scaling_upper_bound <= execution.get_number_extra_jobs() :
					upper_bound_ok = False

			if upper_bound_ok :
				node = find_first_node(sbatch_id, url)

				command = "source"
				params = []
				params.append(enqueue_env_file)
				params.append(";")
				params.append("adapt_compss_resources")
				params.append(node)
				params.append(sbatch_id)
				params.append('CREATE SLURM-Cluster default')
				params.append(singularity_image_file)
				output = shell.execute_command(command, url, params)

				job_name = parse_add_resource_output(output)
				print(job_name)
				time.sleep(2)
				extra_job_id = get_job_id_after_adaptation(job_name, url)
				print(extra_job_id)

				if extra_job_id != '' and extra_job_id is not None:
					child = Execution()
					child.status = Execution.__status_running__
					child.execution_type = execute_type_singularity_pm
					child.batch_id = extra_job_id
					execution.children.append(child)
					db.session.commit()
					time.sleep(5)
					__add_nodes_to_execution__(child, url)
			else:
				logging.info('Execution already reached its maximum number of extra jobs, no adaptation possible')
		else:
			logging.info("Execution is not in RUNNING status, no action can be done")
	else:
		logging.info("Execution type " + execution.execution_type + " is not compatible with the add resource action")
Example no. 7
 def post(self, actor_id):
     id = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[id])
     except KeyError:
         raise ResourceError("actor not found: {}'".format(actor_id), 404)
     args = self.validate_post()
     Execution.add_execution(id, args)
     return ok(result=actor.display(),
               msg="Actor execution added successfully.")
Example no. 8
    def test_child_parent_relationship(self):
        """
		It tests the child parent relationship between several executions
		"""

        parent = Execution()
        parent.status = "x1"
        db.session.add(parent)
        db.session.commit()

        # Empty list of children
        parent = db.session.query(Execution).filter_by(status="x1").first()
        self.assertEqual(0, len(parent.children))

        # We add children
        child_1 = Execution()
        child_1.status = "x2"
        parent.children.append(child_1)

        child_2 = Execution()
        child_2.status = "x3"
        parent.children.append(child_2)

        db.session.commit()

        parent = db.session.query(Execution).filter_by(status="x1").first()
        self.assertEqual(2, len(parent.children))
        self.assertEqual(child_1, parent.children[0])
        self.assertEqual(child_2, parent.children[1])

        child_1 = db.session.query(Execution).filter_by(status="x2").first()
        self.assertEqual(parent, child_1.parent)

        child_2 = db.session.query(Execution).filter_by(status="x3").first()
        self.assertEqual(parent, child_2.parent)
Example no. 9
 def save_execution_plan(feature_id):
     feature = Feature.objects.get(pk=feature_id)
     if feature.executionLock:
         return "execution already started"
     else:
         feature.lock_feature()
         execution = Execution()
         workspace = WorkSpace.objects.get(pk=feature.workspace)
         execution.fill(workspace, "planed", "hardcode-executor")
         execution.save()
         return "ok"
Example no. 10
def createTables():
    try:
        Execution.create_table()
        print("Tabela 'Execution' criada com sucesso!")
    except peewee.OperationalError:
        print("Tabela 'Execution' ja existe!")
    try:
        ExecutionItem.create_table()
        print("Tabela 'ExecutionItem' criada com sucesso!")
    except peewee.OperationalError:
        print("Tabela 'ExecutionItem' ja existe!")
Example no. 11
 def post(self, actor_id):
     logger.debug("top of POST /actors/{}/executions".format(actor_id))
     id = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[id])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError(
             "No actor found with id: {}.".format(actor_id), 404)
     args = self.validate_post()
     logger.debug("execution post args validated: {}.".format(actor_id))
     Execution.add_execution(id, args)
     logger.info("execution added: {}.".format(actor_id))
     return ok(result=actor.display(), msg="Actor execution added successfully.")
Example no. 12
    def test_many_to_many_relations_with_nodes(self):
        """
		It tests the many to many relations with Nodes
		"""

        node_1 = Node()
        node_1.name = "node1"
        node_1.information_retrieved = False
        node_2 = Node()
        node_2.name = "node2"
        node_2.information_retrieved = False
        db.session.add(node_1)
        db.session.add(node_2)

        execution_1 = Execution()
        execution_1.status = "x1"
        execution_2 = Execution()
        execution_2.status = "x2"
        db.session.add(execution_1)
        db.session.add(execution_2)

        db.session.commit()

        execution_1.nodes = [node_1, node_2]
        execution_2.nodes = [node_2, node_1]

        db.session.commit()

        execution = db.session.query(Execution).filter_by(status="x1").first()
        self.assertEqual(node_1, execution.nodes[0])
        self.assertEqual(node_2, execution.nodes[1])

        execution = db.session.query(Execution).filter_by(status="x2").first()
        self.assertEqual(node_2, execution.nodes[0])
        self.assertEqual(node_1, execution.nodes[1])
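
A relationship like execution.nodes typically needs an association table on the model side. A minimal sketch with table and column names assumed (the real model definitions are not shown in this example):

executions_nodes = db.Table(
    'executions_nodes',
    db.Column('execution_id', db.Integer, db.ForeignKey('executions.id')),
    db.Column('node_id', db.Integer, db.ForeignKey('nodes.id')))

# ... and on the Execution model:
#     nodes = db.relationship('Node', secondary=executions_nodes)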
Example no. 13
 def get(self, actor_id, execution_id):
     def get_hypermedia(actor, exc):
         return {'_links': {'self': '{}/actors/v2/{}/executions/{}/logs'.format(actor.api_server, actor.id, exc.id),
                            'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                            'execution': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc.id)},
                 }
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         raise APIException(
             "actor not found: {}".format(actor_id), 404)
     try:
         excs = executions_store[dbid]
     except KeyError:
         raise APIException("No executions found for actor {}.".format(actor_id))
     try:
         exc = Execution.from_db(excs[execution_id])
     except KeyError:
         raise APIException("Execution not found {}.".format(execution_id))
     try:
         logs = logs_store[execution_id]
     except KeyError:
         logs = ""
     result={'logs': logs}
     result.update(get_hypermedia(actor, exc))
     return ok(result, msg="Logs retrieved successfully.")
Example no. 14
    def setUp(self):
        """
        It creates the model objects and saves them in the database
        """
        super(RankingTests, self).setUp()

        self.execution = Execution()
        self.execution.slurm_sbatch_id = 2333

        execution_configuration = ExecutionConfiguration()
        execution_configuration.id = 22
        self.execution.execution_configuration = execution_configuration

        application = Application()
        application.name = "Matmul"
        execution_configuration.application = application

        testbed = Testbed("nova", True, "SLURM", "SSH", "*****@*****.**",
                          ["SINGULARITY"])
        execution_configuration.testbed = testbed

        db.session.add(testbed)
        db.session.add(application)
        db.session.add(execution_configuration)
        db.session.add(self.execution)
        db.session.commit()
Example no. 15
 def get(self, actor_id, execution_id):
     def get_hypermedia(actor, exc):
         return {'_links': {'self': '{}/actors/v2/{}/executions/{}/logs'.format(actor.api_server, actor.id, exc.id),
                            'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                            'execution': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc.id)},
                 }
     logger.debug("top of GET /actors/{}/executions/{}/logs.".format(actor_id, execution_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actor = Actor.from_db(actors_store[dbid])
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError(
             "No actor found with id: {}.".format(actor_id), 404)
     try:
         excs = executions_store[dbid]
     except KeyError:
         logger.debug("did not find executions. actor: {}.".format(actor_id))
         raise ResourceError("No executions found for actor {}.".format(actor_id))
     try:
         exc = Execution.from_db(excs[execution_id])
     except KeyError:
         logger.debug("did not find execution: {}. actor: {}.".format(execution_id, actor_id))
         raise ResourceError("Execution {} not found.".format(execution_id))
     try:
         logs = logs_store[execution_id]
     except KeyError:
         logger.debug("did not find logs. execution: {}. actor: {}.".format(execution_id, actor_id))
         logs = ""
     result={'logs': logs}
     result.update(get_hypermedia(actor, exc))
     return ok(result, msg="Logs retrieved successfully.")
Example no. 16
def execute_application_type_slurm_sbatch(execution, identifier):
    """
	Executes an application with a device supervisor configured
	for slurm sbatch
	"""

    execution_configuration, testbed, deployment, executable = __get_srun_info__(
        execution, identifier)

    if testbed.category != Testbed.slurm_category:
        # If the category is not SLURM we cannot execute the app
        execution.status = execute_status_failed
        execution.output = "Testbed does not support " + execute_type_slurm_sbatch + " applications"
        db.session.commit()

    elif not testbed.on_line:
        # If the testbed is off-line we cannot execute the app
        execution.status = execute_status_failed
        execution.output = "Testbed is off-line"
        db.session.commit()

    else:
        # Preparing the command to be executed
        command = "sbatch"
        endpoint = testbed.endpoint
        params = []
        params.append(executable.executable_file)

        logging.info("Launching execution of application: command: " +
                     command + " | endpoint: " + endpoint + " | params: " +
                     str(params))

        output = shell.execute_command(command, endpoint, params)
        print(output)

        sbatch_id = __extract_id_from_sbatch__(output)

        execution = Execution()
        execution.execution_type = execution_configuration.execution_type
        execution.status = Execution.__status_running__
        execution_configuration.executions.append(execution)
        execution.slurm_sbatch_id = sbatch_id
        db.session.commit()

        # Add nodes
        __add_nodes_to_execution__(execution, endpoint)
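
The helper __extract_id_from_sbatch__ is not shown in this example. sbatch normally answers with a line such as "Submitted batch job 4340", so a minimal sketch under that assumption could be:

import re

def __extract_id_from_sbatch__(output):
    """Hypothetical sketch: pull the numeric job id out of the sbatch output"""
    match = re.search(r'Submitted batch job (\d+)', str(output))
    return match.group(1) if match else None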
Example no. 17
    def post(self, actor_id):
        def get_hypermedia(actor, exc):
            return {
                '_links': {
                    'self':
                    '{}/actors/v2/{}/executions/{}'.format(
                        actor.api_server, actor.id, exc),
                    'owner':
                    '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                    'messages':
                    '{}/actors/v2/{}/messages'.format(actor.api_server,
                                                      actor.id)
                },
            }

        args = self.validate_post()
        d = {}
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        for k, v in request.args.items():
            if k == 'message':
                continue
            d[k] = v
        if hasattr(g, 'user'):
            d['_abaco_username'] = g.user
        if hasattr(g, 'api_server'):
            d['_abaco_api_server'] = g.api_server
        # if hasattr(g, 'jwt'):
        #     d['_abaco_jwt'] = g.jwt
        # if hasattr(g, 'jwt_server'):
        #     d['_abaco_jwt_server'] = g.jwt_server
        if hasattr(g, 'jwt_header_name'):
            d['_abaco_jwt_header_name'] = g.jwt_header_name
        dbid = Actor.get_dbid(g.tenant, actor_id)
        # create an execution
        exc = Execution.add_execution(
            dbid, {
                'cpu': 0,
                'io': 0,
                'runtime': 0,
                'status': SUBMITTED,
                'executor': g.user
            })
        d['_abaco_execution_id'] = exc
        d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
        ch = ActorMsgChannel(actor_id=dbid)
        ch.put_msg(message=args['message'], d=d)
        # make sure at least one worker is available
        actor = Actor.from_db(actors_store[dbid])
        actor.ensure_one_worker()
        result = {'execution_id': exc, 'msg': args['message']}
        result.update(get_hypermedia(actor, exc))
        case = Config.get('web', 'case')
        if not case == 'camel':
            return ok(result)
        else:
            return ok(dict_to_camel(result))
Example no. 18
def inserTablesData():
    execution_1 = Execution.create(input='JAN21-A', outputs='none')

    execution_2 = Execution.create(input='JAN21-B', outputs='noneA')

    execution_3 = Execution.create(input='JAN23-C', outputs='nonec')

    executionItem_1 = {
        'status': 'STARTED',
        'params': 'nada',
        'result': '',
        'error': '',
        'execution_id': execution_1
    }

    executionItem_2 = {
        'status': 'SUCCESS',
        'params': 'nada',
        'result': '',
        'error': '',
        'execution_id': execution_1
    }

    executionItem_3 = {
        'status': 'STARTED',
        'params': 'nada',
        'result': '',
        'error': '',
        'execution_id': execution_2
    }

    executionItem_4 = {
        'status': 'STARTED',
        'params': 'nada',
        'result': '',
        'error': '',
        'execution_id': execution_3
    }
    arrayOfExecutionItens = [
        executionItem_1, executionItem_2, executionItem_3, executionItem_4
    ]

    ExecutionItem.insert_many(arrayOfExecutionItens).execute()
Example no. 19
def __parse_output__(output, endpoint, execution_configuration, child_execution=None):
	"""
	It parses output and adds nodes to the execution
	"""

	sbatch_id = __extract_id_from_squeue__(output)
	execution = None
	
	if child_execution:
		execution = child_execution
	else:
		execution = Execution()
		execution.execution_type = execution_configuration.execution_type
		execution_configuration.executions.append(execution)
	
	execution.status = Execution.__status_running__
	execution.batch_id = sbatch_id
	db.session.commit()

	# Add nodes
	__add_nodes_to_execution__(execution, endpoint)
Example no. 20
def stop_execution(execution):
    """
	It stops a checkpointable execution
	"""

    if Application.CHECKPOINTABLE == execution.execution_configuration.application.application_type:
        child = None

        if execution.status == Execution.__status_running__:
            child = Execution()
            child.status = Execution.__status_running__
            child.execution_configuration = execution.execution_configuration
            child.execution_type = execution.execution_configuration.execution_type
            child.slurm_sbatch_id = execution.slurm_sbatch_id

            execution.slurm_sbatch_id = -1
            execution.children.append(child)
        else:
            child = next(
                filter(
                    lambda child: child.status == Execution.__status_running__,
                    execution.children))  # Only one execution can be running

        execution.status = Execution.__status_stopped__
        db.session.commit()

        cancel_execution(child,
                         execution.execution_configuration.testbed.endpoint)
    else:
        slurm.stop_execution(
            execution.slurm_sbatch_id,
            execution.execution_configuration.testbed.endpoint)
Example no. 21
    def post(self, actor_id):
        def get_hypermedia(actor, exc):
            return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                               'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                               'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},}

        args = self.validate_post()
        d = {}
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        for k, v in request.args.items():
            if k == 'message':
                continue
            d[k] = v
        if hasattr(g, 'user'):
            d['_abaco_username'] = g.user
        if hasattr(g, 'api_server'):
            d['_abaco_api_server'] = g.api_server
        # if hasattr(g, 'jwt'):
        #     d['_abaco_jwt'] = g.jwt
        # if hasattr(g, 'jwt_server'):
        #     d['_abaco_jwt_server'] = g.jwt_server
        if hasattr(g, 'jwt_header_name'):
            d['_abaco_jwt_header_name'] = g.jwt_header_name
        dbid = Actor.get_dbid(g.tenant, actor_id)
        # create an execution
        exc = Execution.add_execution(dbid, {'cpu': 0,
                                             'io': 0,
                                             'runtime': 0,
                                             'status': SUBMITTED,
                                             'executor': g.user})
        d['_abaco_execution_id'] = exc
        d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
        ch = ActorMsgChannel(actor_id=dbid)
        ch.put_msg(message=args['message'], d=d)
        # make sure at least one worker is available
        workers = Worker.get_workers(dbid)
        actor = Actor.from_db(actors_store[dbid])
        if len(workers.items()) < 1:
            ch = CommandChannel()
            ch.put_cmd(actor_id=dbid, image=actor.image, tenant=g.tenant, num=1, stop_existing=False)
        result={'execution_id': exc, 'msg': args['message']}
        result.update(get_hypermedia(actor, exc))
        case = Config.get('web', 'case')
        if not case == 'camel':
            return ok(result)
        else:
            return ok(dict_to_camel(result))
Example no. 22
 def get(self, actor_id, execution_id):
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actors_store[dbid]
     except KeyError:
         raise APIException(
             "actor not found: {}".format(actor_id), 404)
     try:
         excs = executions_store[dbid]
     except KeyError:
         raise APIException("No executions found for actor {}.".format(actor_id))
     try:
         exc = Execution.from_db(excs[execution_id])
     except KeyError:
         raise APIException("Execution not found {}.".format(execution_id))
     return ok(result=exc.display(), msg="Actor execution retrieved successfully.")
Example no. 23
 def get(self, actor_id, execution_id):
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actors_store[dbid]
     except KeyError:
         raise ResourceError("actor not found: {}'".format(actor_id), 404)
     try:
         excs = executions_store[dbid]
     except KeyError:
         raise ResourceError(
             "No executions found for actor {}.".format(actor_id))
     try:
         exc = Execution.from_db(excs[execution_id])
     except KeyError:
         raise ResourceError("Execution not found {}.".format(execution_id))
     return ok(result=exc.display(),
               msg="Actor execution retrieved successfully.")
Example no. 24
def execute_application(execution_configuration,
                        create_profile=False,
                        use_stored_profile=False):
    """
	This function executes an application in the selected testbed,
	using the execution script configuration.
	"""

    # We create the execution
    execution = Execution()
    execution.execution_type = execution_configuration.execution_type
    execution.status = execute_status_submitted

    profile_folder = app.config['APP_PROFILE_FOLDER']

    db.session.add(execution)

    db.session.commit()

    # We verify that we recognize the type of execution
    if execution.execution_type == execute_type_slurm_sbatch:

        t = Thread(target=execute_application_type_slurm_sbatch,
                   args=(execution, execution_configuration.id))
        t.start()
        return t
    elif execution.execution_type == execute_type_singularity_pm:
        t = Thread(target=execute_application_type_singularity_pm,
                   args=(execution, execution_configuration.id, create_profile,
                         use_stored_profile, profile_folder))
        t.start()
        return t
    elif execution.execution_type == execute_type_singularity_srun:
        t = Thread(target=execute_application_type_singularity_srun,
                   args=(execution, execution_configuration.id))
        t.start()
        return t
    elif execution.execution_type == execute_type_slurm_srun:
        t = Thread(target=execute_application_type_slurm_srun,
                   args=(execution, execution_configuration.id))
        t.start()
        return t
    elif execution.execution_type == Executable.__type_pm__:
        t = Thread(target=execute_application_type_pm,
                   args=(execution, execution_configuration.id, create_profile,
                         use_stored_profile, profile_folder))
        t.start()
        return t
    else:
        execution.status = execute_status_failed
        execution.output = "No support for execurtion type: " + execution.execution_type
        db.session.commit()
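
Every recognized branch starts a Thread and returns it, so a caller can block until the run finishes. A hypothetical usage, assuming an ExecutionConfiguration already stored in the db:

configuration = db.session.query(ExecutionConfiguration).filter_by(id=22).first()
t = execute_application(configuration, create_profile=True)
if t is not None:  # None means the execution type was not recognized
    t.join()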
Example no. 25
 def get(self, actor_id, execution_id):
     logger.debug("top of GET /actors/{}/executions/{}.".format(actor_id, execution_id))
     dbid = Actor.get_dbid(g.tenant, actor_id)
     try:
         actors_store[dbid]
     except KeyError:
         logger.debug("did not find actor: {}.".format(actor_id))
         raise ResourceError(
             "No actor found with id: {}.".format(actor_id), 404)
     try:
         excs = executions_store[dbid]
     except KeyError:
         logger.debug("did not find executions: {}.".format(actor_id))
         raise ResourceError("No executions found for actor {}.".format(actor_id))
     try:
         exc = Execution.from_db(excs[execution_id])
     except KeyError:
         logger.debug("did not find execution: {}. actor: {}.".format(execution_id,
                                                                      actor_id))
         raise ResourceError("Execution not found {}.".format(execution_id))
     return ok(result=exc.display(), msg="Actor execution retrieved successfully.")
Example no. 26
def process_link(link, msg, d):
    """
    Process an event with a link.
    :return: 
    """
    # ensure that the linked actor still exists; the link attribute is *always* the dbid of the linked
    # actor
    logger.debug("top of process_link")
    try:
        actors_store[link]
    except KeyError as e:
        logger.error(
            "Processing event message for actor {} that does not exist. Quitting."
            .format(link))
        raise e

    # create an execution for the linked actor with message
    exc = Execution.add_execution(
        link, {
            'cpu': 0,
            'io': 0,
            'runtime': 0,
            'status': SUBMITTED,
            'executor': 'Abaco Event'
        })
    logger.info(
        "Events processor agent added execution {} for actor {}".format(
            exc, link))
    d['_abaco_execution_id'] = exc
    logger.debug(
        "sending message to actor. Final message {} and message dictionary: {}"
        .format(msg, d))
    ch = ActorMsgChannel(actor_id=link)
    ch.put_msg(message=msg, d=d)
    ch.close()
    logger.info("link processed.")
Example no. 27
def restart_execution(execution):
	"""
	It stops a checkpointable execution
	"""
	
	# We create the execution
	child = Execution()
	child.execution_type = execution.execution_configuration.execution_type
	child.status = Execution.__status_submitted__
	execution.children.append(child)
	execution.status = Execution.__status_restarted__
	db.session.commit()
	
	if execution.execution_configuration.execution_type == execute_type_slurm_srun:
		execute_application_type_slurm_srun(child, execution.execution_configuration_id, True)
		child.status = Execution.__status_running__
		db.session.commit()

	else:
		child.status = Execution.__status_failed__
		db.session.commit()
Example no. 28
def subscribe(tenant, actor_id, image, worker_id, api_server, client_id,
              client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also launches a separate thread which ultimately subscribes to the worker channel
    for future communications.
    :return:
    """
    logger.debug("Top of subscribe(). worker_id: {}".format(worker_id))
    actor_ch = ActorMsgChannel(actor_id)
    # establish configs for this worker -------
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.debug("No leave_containers value configured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.debug("leave_containers: {}".format(leave_containers))

    try:
        mem_limit = Config.get('workers', 'mem_limit')
    except configparser.NoOptionError:
        logger.debug("No mem_limit value configured.")
        mem_limit = "-1"
    mem_limit = str(mem_limit)

    try:
        max_cpus = Config.get('workers', 'max_cpus')
    except configparser.NoOptionError:
        logger.debug("No max_cpus value configured.")
        max_cpus = "-1"

    logger.debug("max_cpus: {}".format(max_cpus))

    # instantiate an OAuth client python object if credentials were passed -----
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")

    # start a separate thread for handling messages sent to the worker channel ----
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch,
                         args=(tenant, worker_ch, actor_id, worker_id,
                               actor_ch, ag))
    t.start()

    # subscribe to the actor message queue -----
    logger.info(
        "Worker subscribing to actor channel. worker_id: {}".format(worker_id))
    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # global tracks whether this worker should keep running.
    globals.keep_running = True

    # consecutive_errors tracks the number of consecutive times a worker has gotten an error trying to process a
    # message. Even though the message will be requeued, we do not want the worker to continue processing
    # indefinitely when a compute node is unhealthy.
    consecutive_errors = 0

    # main subscription loop -- processing messages from actor's mailbox
    while globals.keep_running:
        logger.debug("top of keep_running; worker id: {}".format(worker_id))
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            logger.debug(
                "updated worker status to READY in SUBSCRIBE; worker id: {}".
                format(worker_id))
            update_worker_status = False
        try:
            msg, msg_obj = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting. worker id: {}".format(
                worker_id))
            globals.keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))

        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error(
                "unexpected exception from call to update_worker_status. Nacking message."
                "actor_id: {}; worker_id: {}; status: {}; exception: {}".
                format(actor_id, worker_id, BUSY, e))
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            msg_obj.nack(requeue=True)
            raise e
        update_worker_status = True
        logger.info(
            "Received message {}. Starting actor container. worker id: {}".
            format(msg, worker_id))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        try:
            actor = Actor.from_db(actors_store[actor_id])
            execution_id = msg['_abaco_execution_id']
            content_type = msg['_abaco_Content_Type']
            mounts = actor.mounts
            logger.debug("actor mounts: {}".format(mounts))
        except Exception as e:
            logger.error(
                "unexpected exception retrieving actor, execution, content-type, mounts. Nacking message."
                "actor_id: {}; worker_id: {}; status: {}; exception: {}".
                format(actor_id, worker_id, BUSY, e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers',
                                              'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError) as e:
            logger.error(
                "No socket_host_path configured. Cannot manage results data. Nacking message"
            )
            Actor.set_status(
                actor_id,
                ERROR,
                status_message="Abaco instance not configured for results data."
            )
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e
        socket_host_path = '{}.sock'.format(
            os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({
            'host_path': socket_host_path,
            'container_path': '/_abaco_results.sock',
            'format': 'ro'
        })
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers',
                                                'fifo_host_path_dir')
            except (configparser.NoSectionError,
                    configparser.NoOptionError) as e:
                logger.error(
                    "No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(
                    actor_id,
                    ERROR,
                    status_message=
                    "Abaco instance not configured for binary data. Nacking message."
                )
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id,
                                          execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error(
                    "Could not create fifo_path. Nacking message. Exception: {}"
                    .format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
            # add the fifo as a mount:
            mounts.append({
                'host_path': fifo_host_path,
                'container_path': '/_abaco_binary_data',
                'format': 'ro'
            })

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug(
            "Adding worker_id to execution. worker_id: {}".format(worker_id))
        try:
            Execution.add_worker_id(actor_id, execution_id, worker_id)
        except Exception as e:
            logger.error(
                "Unexpected exception adding worker_id to the Execution. Nacking message. Exception: {}"
                .format(e))
            msg_obj.nack(requeue=True)
            logger.info("worker exiting. worker_id: {}".format(worker_id))
            raise e

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}; worker_id: {}".format(
            privileged, worker_id))

        # overlay resource limits if set on actor:
        if actor.mem_limit:
            mem_limit = actor.mem_limit
        if actor.max_cpus:
            max_cpus = actor.max_cpus

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user valiue: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_worker_id'] = worker_id
        environment['_abaco_container_repo'] = actor.image
        environment['_abaco_actor_state'] = actor.state
        environment['_abaco_actor_name'] = actor.name or 'None'
        logger.debug("Overlayed environment: {}; worker_id: {}".format(
            environment, worker_id))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info(
                    "Refreshed the tokens. Passed {} to the environment.".
                    format(token))
            except Exception as e:
                logger.error(
                    "Got an exception trying to get an access token. Stopping worker and nacking message. "
                    "Exception: {}".format(e))
                msg_obj.nack(requeue=True)
                logger.info("worker exiting. worker_id: {}".format(worker_id))
                raise e
        else:
            logger.info(
                "Agave client `ag` is None -- not passing access token; worker_id: {}"
                .format(worker_id))
        logger.info("Passing update environment: {}".format(environment))
        logger.info("About to execute actor; worker_id: {}".format(worker_id))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(
                actor_id, worker_id, execution_id, image, message, user,
                environment, privileged, mounts, leave_containers,
                fifo_host_path, socket_host_path, mem_limit, max_cpus)
        except DockerStartContainerError as e:
            logger.error(
                "Worker {} got DockerStartContainerError: {} trying to start actor for execution {}."
                "Placing message back on queue.".format(
                    worker_id, e, execution_id))
            # if we failed to start the actor container, we leave the worker up and re-queue the original message
            msg_obj.nack(requeue=True)
            logger.debug('message requeued.')
            consecutive_errors += 1
            if consecutive_errors > MAX_WORKER_CONSECUTIVE_ERRORS:
                logger.error(
                    "Worker {} failed to successfully start actor for execution {} {} consecutive times; "
                    "Exception: {}. Putting the actor in error status and shutting "
                    "down workers.".format(worker_id, execution_id,
                                           MAX_WORKER_CONSECUTIVE_ERRORS, e))
                Actor.set_status(actor_id, ERROR,
                                 "Error executing container: {}".format(e))
                shutdown_workers(actor_id, delete_actor_ch=False)
                # wait for worker to be shutdown..
                time.sleep(60)
                break
            else:
                # sleep five seconds before getting a message again to give time for the compute
                # node and/or docker health to recover
                time.sleep(5)
                continue
        except DockerStopContainerError as e:
            logger.error(
                "Worker {} was not able to stop actor for execution: {}; Exception: {}. "
                "Putting the actor in error status and shutting down workers.".
                format(worker_id, execution_id, e))
            Actor.set_status(actor_id, ERROR,
                             "Error executing container: {}".format(e))
            # since the error was with stopping the actor, we will consider this message "processed"; this choice
            # could be reconsidered/changed
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        except Exception as e:
            logger.error(
                "Worker {} got an unexpected exception trying to run actor for execution: {}."
                "Putting the actor in error status and shutting down workers. "
                "Exception: {}; type: {}".format(worker_id, execution_id, e,
                                                 type(e)))
            Actor.set_status(actor_id, ERROR,
                             "Error executing container: {}".format(e))
            # the execute_actor function raises a DockerStartContainerError if it met an exception before starting the
            # actor container; if the container was started, then another exception should be raised. Therefore,
            # we can assume here that the container was at least started and we can ack the message.
            msg_obj.ack()
            shutdown_workers(actor_id, delete_actor_ch=False)
            # wait for worker to be shutdown..
            time.sleep(60)
            break
        # ack the message
        msg_obj.ack()
        logger.debug(
            "container finished successfully; worker_id: {}".format(worker_id))
        # Add the completed stats to the execution
        logger.info(
            "Actor container finished successfully. Got stats object:{}".
            format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats,
                                     final_state, exit_code, start_time)
        logger.info("Added execution: {}; worker_id: {}".format(
            execution_id, worker_id))

        # Add the logs to the execution
        try:
            Execution.set_logs(execution_id, logs)
            logger.debug("Successfully added execution logs.")
        except Exception as e:
            msg = "Got exception trying to set logs for exception {}; " \
                  "Exception: {}; worker_id: {}".format(execution_id, e, worker_id)
            logger.error(msg)

        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
            logger.debug("worker execution time updated. worker_id: {}".format(
                worker_id))
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info(
                "worker {} got unexpected key error trying to update its execution time. "
                "Worker better be shutting down! keep_running: {}".format(
                    worker_id, globals.keep_running))
            if globals.keep_running:
                logger.error(
                    "worker couldn't update its execution time but keep_running is still true!"
                )

        # we completed an execution successfully; reset the consecutive_errors counter
        consecutive_errors = 0
        logger.info(
            "worker time stamps updated; worker_id: {}".format(worker_id))
    logger.info(
        "global.keep_running no longer true. worker is now exited. worker id: {}"
        .format(worker_id))
Example no. 29
def __execute_pm_applications__(execution, identifier, create_profile,
                                use_storage_profile, profile_folder,
                                singularity):
    """
	It executes a Singularity PM application in a targatted testbed
	"""

    # If create_profile = True we need to create a profile and associate it with the execution
    profile_file = ''
    if create_profile:
        profile_file = profile_folder + '/' + str(uuid.uuid4()) + '.profile'

    # Let's recover all the information needed...execution_configuration
    execution_configuration = db.session.query(
        ExecutionConfiguration).filter_by(id=identifier).first(
        )  # This is to avoid reusing objects from other thread
    testbed = db.session.query(Testbed).filter_by(
        id=execution_configuration.testbed_id).first()
    deployment = db.session.query(Deployment).filter_by(
        executable_id=execution_configuration.executable_id,
        testbed_id=testbed.id).first()
    executable = db.session.query(Executable).filter_by(
        id=execution_configuration.executable_id).first()

    # Preparing the command to be executed
    command = "source"
    endpoint = testbed.endpoint
    params = []
    params.append(testbed.extra_config['enqueue_env_file'])
    params.append(";")
    params.append("enqueue_compss")
    params.append("--sc_cfg=" + testbed.extra_config['enqueue_compss_sc_cfg'])
    params.append("--num_nodes=" + str(execution_configuration.num_nodes))
    params.append("--gpus_per_node=" +
                  str(execution_configuration.num_gpus_per_node))
    params.append("--cpus_per_node=" +
                  str(execution_configuration.num_cpus_per_node))

    if singularity:
        params.append("--container_image=" + deployment.path)
        params.append(
            "--container_compss_path=/opt/TANGO/TANGO_ProgrammingModel/COMPSs/"
        )  # TODO Ugly... ugly... and more ugly...
        #params.append("--appdir=" + executable.singularity_app_folder)
        params.append(
            "--appdir=/apps/application/")  # TODO Ugly... fix this...
    else:
        params.append("--appdir=" + executable.singularity_app_folder)
    params.append("--exec_time=" + str(execution_configuration.exec_time))

    # If create profile
    if create_profile:
        params.append("--output_profile=" + profile_file)
    # If we use a stored profile  --input_profile=<path>
    if use_storage_profile:
        params.append("--input_profile=" +
                      execution_configuration.profile_file)
    params.append(execution_configuration.compss_config)
    params.append(execution_configuration.command)

    logging.info("Launching execution of application: command: " + command +
                 " | endpoint: " + endpoint + " | params: " + str(params))

    output = shell.execute_command(command, endpoint, params)
    sbatch_id = __extract_id_from_sigularity_pm_app__(output)

    execution = Execution()
    execution.execution_type = execution_configuration.execution_type
    execution.status = Execution.__status_running__
    execution_configuration.executions.append(execution)
    # if we create the profile, we add it to the execution configuration
    if create_profile:
        execution_configuration.profile_file = profile_file
    execution.slurm_sbatch_id = sbatch_id
    db.session.commit()

    # Add nodes
    time.sleep(5)
    __add_nodes_to_execution__(execution, endpoint)
Example no. 30
def subscribe(tenant,
              actor_id,
              worker_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    try:
        leave_containers = Config.get('workers', 'leave_containers')
    except configparser.NoOptionError:
        logger.info("No leave_containers value confiured.")
        leave_containers = False
    if hasattr(leave_containers, 'lower'):
        leave_containers = leave_containers.lower() == "true"
    logger.info("leave_containers: {}".format(leave_containers))
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        verify = get_tenant_verify(tenant)
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret,
                   verify=verify)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")

    # keep track of whether we need to update the worker's status back to READY; otherwise, we
    # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s)
    update_worker_status = True

    # shared global tracking whether this worker should keep running; shared between this thread and
    # the "worker channel processing" thread.
    global keep_running

    # main subscription loop -- processing messages from actor's mailbox
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get_one()
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        logger.info("worker {} processing new msg.".format(worker_id))
        try:
            Worker.update_worker_status(actor_id, worker_id, BUSY)
        except Exception as e:
            logger.error("unexpected exception from call to update_worker_status."
                         "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id,
                                                                                         worker_id,
                                                                                         BUSY,
                                                                                         e))
            raise e
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        content_type = msg['_abaco_Content_Type']
        mounts = actor.mounts
        logger.debug("actor mounts: {}".format(mounts))
        # for results, create a socket in the configured directory.
        try:
            socket_host_path_dir = Config.get('workers', 'socket_host_path_dir')
        except (configparser.NoSectionError, configparser.NoOptionError):
            logger.error("No socket_host_path configured. Cannot manage results data.")
            Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for results data.")
            continue
        socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id))
        logger.info("Create socket at path: {}".format(socket_host_path))
        # add the socket as a mount:
        mounts.append({'host_path': socket_host_path,
                       'container_path': '/_abaco_results.sock',
                       'format': 'ro'})
        # for binary data, create a fifo in the configured directory. The configured
        # fifo_host_path_dir is equal to the fifo path in the worker container:
        fifo_host_path = None
        if content_type == 'application/octet-stream':
            try:
                fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir')
            except (configparser.NoSectionError, configparser.NoOptionError):
                logger.error("No fifo_host_path configured. Cannot manage binary data.")
                Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for binary data.")
                continue
            fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id)
            try:
                os.mkfifo(fifo_host_path)
                logger.info("Created fifo at path: {}".format(fifo_host_path))
            except Exception as e:
                logger.error("Could not create fifo_path. Exception: {}".format(e))
                raise e
            # add the fifo as a mount:
            mounts.append({'host_path': fifo_host_path,
                           'container_path': '/_abaco_binary_data',
                           'format': 'ro'})

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if type(actor['privileged']) == bool and actor['privileged']:
            privileged = True
        logger.debug("privileged: {}".format(privileged))

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # construct the user field from the actor's uid and gid:
        user = get_container_user(actor)
        logger.debug("Final user valiue: {}".format(user))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code, start_time = execute_actor(actor_id,
                                                                            worker_id,
                                                                            execution_id,
                                                                            image,
                                                                            message,
                                                                            user,
                                                                            environment,
                                                                            privileged,
                                                                            mounts,
                                                                            leave_containers,
                                                                            fifo_host_path,
                                                                            socket_host_path)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(e))
            Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e))
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time)
        logger.info("Added execution: {}".format(execution_id))

        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")

        # Update the worker's last updated and last execution fields:
        try:
            Worker.update_worker_execution_time(actor_id, worker_id)
        except KeyError:
            # it is possible that this worker was sent a graceful shutdown command in the other thread
            # and that spawner has already removed this worker from the store.
            logger.info("worker {} got unexpected key error trying to update its execution time. "
                        "Worker better be shutting down! keep_running: {}".format(worker_id, keep_running))
            if keep_running:
                logger.error("worker couldn't update's its execution time but keep_running is still true!")

        logger.info("worker time stamps updated.")
Exemplo n.º 31
0
Arquivo: worker.py Projeto: TACC/abaco
def subscribe(tenant,
              actor_id,
              api_server,
              client_id,
              client_secret,
              access_token,
              refresh_token,
              worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                print("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        print("Passing update environment: {}".format(environment))
        try:
            stats, logs = execute_actor(actor_id, worker_ch, image, message,
                                        environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_ch.name)
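
Both versions of subscribe() gate their main loop on the module-level keep_running flag, which the worker-channel thread can flip to request a graceful shutdown. A hypothetical sketch of that side of the handshake (process_worker_ch is not shown in this excerpt, so the command name and body below are assumptions):

# Hypothetical sketch of the shutdown half of the keep_running handshake;
# the real process_worker_ch implementation is not shown in this excerpt.
keep_running = True

def process_worker_ch(worker_ch, actor_ch):
    global keep_running
    while keep_running:
        msg = worker_ch.get()      # block for a command from the spawner
        if msg == 'stop':          # assumed command name
            keep_running = False   # subscribe() loop exits on its next check
            actor_ch.close()       # unblocks subscribe() via ChannelClosedException
            break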
Exemplo n.º 32
0
    def test_patch_execution_preprocessor(self, mock_restart_execution, mock_executor_stop, mock_executor_cancel, mock_executor_add, mock_executor_remove):
        """
        It tests that canceling an execution is handled correctly
        """

        # First we verify the response when no execution exists with the given id
        data = {'status': 'PEPITO'}

        response = self.client.patch("/api/v1/executions/100",
                                     data=json.dumps(data),
                                     content_type='application/json')

        self.assertEquals(409, response.status_code)
        self.assertEquals(
          'No execution by the given id',
          response.json['message'])

        # Preparing the data for the rest of the test
        testbed = Testbed("name", False, "slurm", "ssh", "user@server", ['slurm'])
        db.session.add(testbed)
        db.session.commit()
        application = Application()
        application.name = "xxx"
        application.application_type = "XXX"
        db.session.add(application)
        db.session.commit()
        execution_configuration = ExecutionConfiguration()
        execution_configuration.testbed = testbed
        execution_configuration.application = application
        db.session.add(execution_configuration)
        db.session.commit()
        execution = Execution()
        execution.execution_type = Executable.__type_singularity_srun__
        execution.status = Execution.__status_running__
        execution.execution_configuration = execution_configuration
       
        db.session.add(execution)
        db.session.commit()

        response = self.client.patch("/api/v1/executions/" + str(execution.id) ,
                                     data=json.dumps(data),
                                     content_type='application/json')
        self.assertEquals(409, response.status_code)
        self.assertEquals(
          'No valid state to try to change',
          response.json['message'])

        data = {'PEPITO': 'PEPITO'}
        response = self.client.patch("/api/v1/executions/" + str(execution.id) ,
                                     data=json.dumps(data),
                                     content_type='application/json')

        self.assertEquals(409, response.status_code)
        self.assertEquals(
          'No status, remove_resource, or add_resource field in the payload',
          response.json['message'])

        data = {'status': 'CANCEL'}
        response = self.client.patch("/api/v1/executions/" + str(execution.id) ,
                                     data=json.dumps(data),
                                     content_type='application/json')

        self.assertEquals(200, response.status_code)
        mock_executor_cancel.assert_called_with(execution, 'user@server')

        data = {'add_resource': ''}
        response = self.client.patch("/api/v1/executions/" + str(execution.id) ,
                                     data=json.dumps(data),
                                     content_type='application/json')

        mock_executor_add.assert_called_with(execution)

        data = {'remove_resource': ''}
        response = self.client.patch("/api/v1/executions/" + str(execution.id) ,
                                     data=json.dumps(data),
                                     content_type='application/json')

        mock_executor_remove.assert_called_with(execution)

        # Adding Checkpointable changes of status at ALDE level.
        execution.status = Execution.__status_running__
        application.application_type = Application.CHECKPOINTABLE
        db.session.commit()

        data = {'status': 'STOP'}
        response = self.client.patch("/api/v1/executions/" + str(execution.id),
                                    data=json.dumps(data),
                                    content_type="application/json")
        
        mock_executor_stop.assert_called_with(execution)

        execution.status = Execution.__status_cancel__
        db.session.commit()
        response = self.client.patch("/api/v1/executions/" + str(execution.id),
                                    data=json.dumps(data),
                                    content_type="application/json")
        self.assertEquals(409, response.status_code)
        self.assertEquals(
          'Execution is not in right state',
          response.json['message'])

        # Checkpointable restart
        execution.status = Execution.__status_stopped__
        db.session.commit()
        data = {'status': 'RESTART'}
        
        response = self.client.patch("/api/v1/executions/" + str(execution.id),
                                    data=json.dumps(data),
                                    content_type="application/json")
        
        mock_restart_execution.assert_called_with(execution)

        execution.status = Execution.__status_cancel__
        db.session.commit()
        response = self.client.patch("/api/v1/executions/" + str(execution.id),
                                    data=json.dumps(data),
                                    content_type="application/json")
        self.assertEquals(409, response.status_code)
        self.assertEquals(
          'Execution is not in right state',
          response.json['message'])
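
The test above drives the ALDE PATCH /api/v1/executions/<id> endpoint with status values such as CANCEL, STOP, and RESTART. A minimal client-side sketch, assuming a locally running service (the base URL and the requests dependency are assumptions; the route and payload shape come straight from the test):

import json
import requests  # assumed HTTP client; any equivalent works

def cancel_execution(execution_id, base_url='http://localhost:5000'):
    """Ask ALDE to cancel a running execution."""
    response = requests.patch(
        '{}/api/v1/executions/{}'.format(base_url, execution_id),
        data=json.dumps({'status': 'CANCEL'}),
        headers={'Content-Type': 'application/json'})
    # a 409 here means the execution is missing, in the wrong state,
    # or the payload had no status/add_resource/remove_resource field
    response.raise_for_status()
    return response.json()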
Exemplo n.º 33
0
    def post(self, actor_id):
        def get_hypermedia(actor, exc):
            return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc),
                               'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner),
                               'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},}

        logger.debug("top of POST /actors/{}/messages.".format(actor_id))
        dbid = Actor.get_dbid(g.tenant, actor_id)
        try:
            Actor.from_db(actors_store[dbid])
        except KeyError:
            logger.debug("did not find actor: {}.".format(actor_id))
            raise ResourceError("No actor found with id: {}.".format(actor_id), 404)
        args = self.validate_post()
        d = {}
        # build a dictionary of k:v pairs from the query parameters, and pass a single
        # additional object 'message' from within the post payload. Note that 'message'
        # need not be JSON data.
        logger.debug("POST body validated. actor: {}.".format(actor_id))
        for k, v in request.args.items():
            if k == 'message':
                continue
            d[k] = v
        logger.debug("extra fields added to message from query parameters: {}.".format(d))
        if hasattr(g, 'user'):
            d['_abaco_username'] = g.user
            logger.debug("_abaco_username: {} added to message.".format(g.user))
        if hasattr(g, 'api_server'):
            d['_abaco_api_server'] = g.api_server
            logger.debug("_abaco_api_server: {} added to message.".format(g.api_server))
        # if hasattr(g, 'jwt'):
        #     d['_abaco_jwt'] = g.jwt
        # if hasattr(g, 'jwt_server'):
        #     d['_abaco_jwt_server'] = g.jwt_server
        if hasattr(g, 'jwt_header_name'):
            d['_abaco_jwt_header_name'] = g.jwt_header_name
            logger.debug("abaco_jwt_header_name: {} added to message.".format(g.jwt_header_name))

        # create an execution
        exc = Execution.add_execution(dbid, {'cpu': 0,
                                             'io': 0,
                                             'runtime': 0,
                                             'status': SUBMITTED,
                                             'executor': g.user})
        logger.info("Execution {} added for actor {}".format(exc, actor_id))
        d['_abaco_execution_id'] = exc
        d['_abaco_Content_Type'] = args.get('_abaco_Content_Type', '')
        logger.debug("Final message dictionary: {}".format(d))
        ch = ActorMsgChannel(actor_id=dbid)
        ch.put_msg(message=args['message'], d=d)
        ch.close()
        logger.debug("Message added to actor inbox. id: {}.".format(actor_id))
        # make sure at least one worker is available
        actor = Actor.from_db(actors_store[dbid])
        actor.ensure_one_worker()
        logger.debug("ensure_one_worker() called. id: {}.".format(actor_id))
        if args.get('_abaco_Content_Type') == 'application/octet-stream':
            result = {'execution_id': exc, 'msg': 'binary - omitted'}
        else:
            result = {'execution_id': exc, 'msg': args['message']}
        result.update(get_hypermedia(actor, exc))
        case = Config.get('web', 'case')
        if not case == 'camel':
            return ok(result)
        else:
            return ok(dict_to_camel(result))
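
The handler above validates the POST body, creates a SUBMITTED execution, queues the message on the actor's inbox channel, and returns the execution id plus hypermedia links. A minimal sketch of the client side, assuming an Abaco-style deployment (the base URL, the token header, and the 'result' envelope key are assumptions):

import requests  # assumed HTTP client

def send_message(actor_id, message, base_url, token):
    """POST a message to an actor and return the new execution id."""
    response = requests.post(
        '{}/actors/v2/{}/messages'.format(base_url, actor_id),
        data={'message': message},
        headers={'Authorization': 'Bearer {}'.format(token)})  # assumed auth scheme
    response.raise_for_status()
    # the handler returns {'execution_id': ..., 'msg': ...} plus _links;
    # the 'result' envelope below is an assumption about ok()'s wrapping
    return response.json()['result']['execution_id']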
Exemplo n.º 34
0
def subscribe(tenant, actor_id, worker_id, api_server, client_id,
              client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch,
                         args=(tenant, worker_ch, actor_id, worker_id,
                               actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    update_worker_status = True
    global keep_running
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        update_worker_status = True
        logger.info(
            "Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']

        # the execution object was created by the controller, but we need to add the worker id to it now that we
        # know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)

        # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted.
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        logger.debug("privileged: {}".format(privileged))

        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))

        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))

        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info(
                    "Refreshed the tokens. Passed {} to the environment.".
                    format(token))
            except Exception as e:
                logger.error(
                    "Got an exception trying to get an access token: {}".
                    format(e))
        else:
            logger.info(
                "Agave client `ag` is None -- not passing access token.")
        logger.info("Passing update environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(
                actor_id, worker_id, worker_ch, image, message, environment,
                privileged)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # Add the completed stats to the execution
        logger.info(
            "Actor container finished successfully. Got stats object:{}".
            format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats,
                                     final_state, exit_code)
        logger.info("Added execution: {}".format(execution_id))

        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")

        # Update the worker's last updated and last execution fields:
        Worker.update_worker_execution_time(actor_id, worker_id)
        logger.info("worker time stamps updated.")
Exemplo n.º 35
0
    def test_execute_application_type_torque_qsub(self, mock_shell,
                                                  mock_add_nodes):
        """
        It verifies that an application of type torque qsub is executed
        """

        # First we verify that the testbed must be of type TORQUE for the
        # execution to run; in this case it should give an error since the
        # testbed is not of type torque

        # We define the different entities necessary for the test.
        testbed = Testbed(
            name="nova2",
            on_line=True,
            category="xxxx",
            protocol="SSH",
            endpoint="*****@*****.**",
            package_formats=['sbatch', 'SINGULARITY'],
            extra_config={
                "enqueue_compss_sc_cfg":
                "nova.cfg",
                "enqueue_env_file":
                "/home_nfs/home_ejarquej/installations/rc1707/COMPSs/compssenv"
            })
        db.session.add(testbed)

        application = Application(name="super_app")
        db.session.add(application)
        db.session.commit()  # So application and testbed get an id

        executable = Executable()
        executable.compilation_type = Executable.__type_torque_qsub__
        executable.executable_file = "pepito.sh"
        db.session.add(executable)
        db.session.commit()  # We do this so executable gets and id

        deployment = Deployment()
        deployment.testbed_id = testbed.id
        deployment.executable_id = executable.id
        db.session.add(
            deployment)  # We add the deployment to the db so it has an id

        execution_config = ExecutionConfiguration()
        execution_config.execution_type = Executable.__type_torque_qsub__
        execution_config.application = application
        execution_config.testbed = testbed
        execution_config.executable = executable
        db.session.add(execution_config)
        db.session.commit()

        execution = Execution()
        execution.execution_type = Executable.__type_torque_qsub__
        execution.status = Execution.__status_submitted__

        torque.execute_batch(execution, execution_config.id)

        self.assertEquals(Execution.__status_failed__, execution.status)
        self.assertEquals("Testbed does not support TORQUE:QSUB applications",
                          execution.output)

        # If the testbed is off-line, execution isn't allowed either
        testbed.category = Testbed.torque_category
        testbed.on_line = False
        db.session.commit()

        execution = Execution()
        execution.execution_type = Executable.__type_torque_qsub__
        execution.status = Execution.__status_submitted__

        torque.execute_batch(execution, execution_config.id)

        self.assertEquals(Executable.__type_torque_qsub__,
                          execution.execution_type)
        self.assertEquals(Execution.__status_failed__, execution.status)
        self.assertEquals("Testbed is off-line", execution.output)

        ## Test executing
        output = b'1208.cloudserver'
        mock_shell.return_value = output

        testbed.category = Testbed.torque_category
        testbed.on_line = True
        db.session.commit()

        execution = Execution()
        execution.execution_type = Executable.__type_torque_qsub__
        execution.status = Execution.__status_submitted__

        torque.execute_batch(execution, execution_config.id)

        mock_shell.assert_called_with("qsub", "*****@*****.**",
                                      ["pepito.sh"])
        execution = db.session.query(Execution).filter_by(
            execution_configuration_id=execution_config.id).first()
        self.assertEqual(execution.execution_type,
                         execution_config.execution_type)
        self.assertEqual(execution.status, Execution.__status_running__)
        self.assertEqual("1208.cloudserver", execution.batch_id)
Exemplo n.º 36
0
def execute_actor(actor_id,
                  worker_id,
                  execution_id,
                  image,
                  msg,
                  user=None,
                  d={},
                  privileged=False,
                  mounts=[],
                  leave_container=False,
                  fifo_host_path=None,
                  socket_host_path=None,
                  mem_limit=None,
                  max_cpus=None):
    """
    Creates and runs an actor container and supervises the execution, collecting statistics about resource consumption
    from the Docker daemon.

    :param actor_id: the dbid of the actor; for updating worker status
    :param worker_id: the worker id; also for updating worker status
    :param execution_id: the id of the execution.
    :param image: the actor's image; worker must have already downloaded this image to the local docker registry.
    :param msg: the message being passed to the actor.
    :param user: string in the form {uid}:{gid} representing the uid and gid to run the command as.
    :param d: dictionary representing the environment to instantiate within the actor container.
    :param privileged: whether this actor is "privileged"; i.e., its container should run in privileged mode with the
    docker daemon mounted.
    :param mounts: list of dictionaries representing the mounts to add; each dictionary mount should have 3 keys:
    host_path, container_path and format (which should have value 'ro' or 'rw').
    :param fifo_host_path: If not None, a string representing a path on the host to a FIFO used for passing binary data to the actor.
    :param socket_host_path: If not None, a string representing a path on the host to a socket used for collecting results from the actor.
    :param mem_limit: The maximum amount of memory the Actor container can use; should be the same format as the --memory Docker flag.
    :param max_cpus: The maximum number of CPUs available to each actor container. Does not guarantee these CPU resources; serves as an upper bound.
    :return: result (dict), logs (str) - `result`: statistics about resource consumption; `logs`: output from docker logs.
    """
    logger.debug("top of execute_actor(); (worker {};{})".format(
        worker_id, execution_id))

    # initially set the global force_quit variable to False
    globals.force_quit = False

    # initial stats object, environment, binds and volumes
    result = {'cpu': 0, 'io': 0, 'runtime': 0}

    # instantiate docker client
    cli = docker.APIClient(base_url=dd, version="auto")

    # don't try to pass binary messages through the environment as these can cause
    # broken pipe errors. the binary data will be passed through the FIFO momentarily.
    if not fifo_host_path:
        d['MSG'] = msg
    binds = {}
    volumes = []

    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    logger.debug("privileged: {};(worker {};{})".format(
        privileged, worker_id, execution_id))
    if privileged:
        binds = {
            '/var/run/docker.sock': {
                'bind': '/var/run/docker.sock',
                'ro': False
            }
        }
        volumes = ['/var/run/docker.sock']

    # add a bind key and dictionary as well as a volume for each mount
    for m in mounts:
        binds[m.get('host_path')] = {
            'bind': m.get('container_path'),
            'ro': m.get('format') == 'ro'
        }
        volumes.append(m.get('host_path'))

    # mem_limit
    # -1 => unlimited memory
    if mem_limit == '-1':
        mem_limit = None

    # max_cpus
    try:
        max_cpus = int(max_cpus)
    except (TypeError, ValueError):
        max_cpus = None
    # -1 => unlimited cpus
    if max_cpus == -1:
        max_cpus = None

    host_config = cli.create_host_config(binds=binds,
                                         privileged=privileged,
                                         mem_limit=mem_limit,
                                         nano_cpus=max_cpus)
    logger.debug("host_config object created by (worker {};{}).".format(
        worker_id, execution_id))

    # write binary data to FIFO if it exists:
    if fifo_host_path:
        try:
            fifo = os.open(fifo_host_path, os.O_RDWR)
            os.write(fifo, msg)
        except Exception as e:
            logger.error(
                "Error writing the FIFO. Exception: {};(worker {};{})".format(
                    e, worker_id, execution_id))
            os.remove(fifo_host_path)
            raise DockerStartContainerError("Error writing to fifo: {}; "
                                            "(worker {};{})".format(
                                                e, worker_id, execution_id))

    # set up results socket -----------------------
    # make sure socket doesn't already exist:
    try:
        os.unlink(socket_host_path)
    except OSError as e:
        if os.path.exists(socket_host_path):
            logger.error(
                "socket at {} already exists; Exception: {}; (worker {};{})".
                format(socket_host_path, e, worker_id, execution_id))
            raise DockerStartContainerError(
                "Got an OSError trying to create the results docket; "
                "exception: {}".format(e))

    # use retry logic since, when the compute node is under load, we see errors initially trying to create the socket
    # server object.
    keep_trying = True
    count = 0
    server = None
    while keep_trying and count < 10:
        keep_trying = False
        count = count + 1
        try:
            server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
        except Exception as e:
            keep_trying = True
            logger.info("Could not instantiate socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id,
                            execution_id))
        try:
            server.bind(socket_host_path)
        except Exception as e:
            keep_trying = True
            logger.info("Could not bind socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id,
                            execution_id))
        try:
            os.chmod(socket_host_path, 0o777)
            logger.debug(
                "results socket permissions set to 777. socket_host_path: {}".
                format(socket_host_path))
        except Exception as e:
            msg = f"Got exception trying to set permissions on the results socket. Not sure what to do. e: {e}"
            logger.error(msg)
            # for now, we'll just swallow it but this is really a TODO.

        try:
            server.settimeout(RESULTS_SOCKET_TIMEOUT)
        except Exception as e:
            keep_trying = True
            logger.info("Could not set timeout for socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id,
                            execution_id))
    if not server:
        msg = "Failed to instantiate results socket. " \
              "Abaco compute host could be overloaded. Exception: {}; (worker {};{})".format(e, worker_id, execution_id)
        logger.error(msg)
        raise DockerStartContainerError(msg)

    logger.debug(
        "results socket server instantiated. path: {} (worker {};{})".format(
            socket_host_path, worker_id, execution_id))

    # instantiate the results channel:
    results_ch = ExecutionResultsChannel(actor_id, execution_id)

    # create and start the container
    logger.debug("Final container environment: {};(worker {};{})".format(
        d, worker_id, execution_id))
    logger.debug(
        "Final binds: {} and host_config: {} for the container.(worker {};{})".
        format(binds, host_config, worker_id, execution_id))
    container = cli.create_container(image=image,
                                     environment=d,
                                     user=user,
                                     volumes=volumes,
                                     host_config=host_config)
    # get the UTC time stamp
    start_time = get_current_utc_time()
    # start the timer to track total execution time.
    start = timeit.default_timer()
    logger.debug("right before cli.start: {}; container id: {}; "
                 "(worker {};{})".format(start, container.get('Id'), worker_id,
                                         execution_id))
    try:
        cli.start(container=container.get('Id'))
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        logger.info(
            "Got exception starting actor container: {}; (worker {};{})".
            format(e, worker_id, execution_id))
        raise DockerStartContainerError(
            "Could not start container {}. Exception {}".format(
                container.get('Id'), str(e)))

    # local bool tracking whether the actor container is still running
    running = True
    Execution.update_status(actor_id, execution_id, RUNNING)

    logger.debug("right before creating stats_cli: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))
    # create a separate cli for checking stats objects since these should be fast and we don't want to wait
    stats_cli = docker.APIClient(base_url=dd, timeout=1, version="auto")
    logger.debug("right after creating stats_cli: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # under load, we can see UnixHTTPConnectionPool ReadTimeout's trying to create the stats_obj
    # so here we are trying up to 3 times to create the stats object for a possible total of 3s
    # timeouts
    ct = 0
    stats_obj = None
    logs = None
    while ct < 3:
        try:
            stats_obj = stats_cli.stats(container=container.get('Id'),
                                        decode=True)
            break
        except ReadTimeout:
            ct += 1
        except Exception as e:
            logger.error(
                "Unexpected exception creating stats_obj. Exception: {}; (worker {};{})"
                .format(e, worker_id, execution_id))
            # in this case, we need to kill the container since we cannot collect stats;
            # UPDATE - 07-2018: under load, errors can occur attempting to create the stats object.
            # the container could still be running; we need to explicitly check the container status
            # to be sure.
    logger.debug(
        "right after attempting to create stats_obj: {}; (worker {};{})".
        format(timeit.default_timer(), worker_id, execution_id))
    # a counter of the number of iterations through the main "running" loop;
    # this counter is used to determine when less frequent actions, such as log aggregation, need to run.
    loop_idx = 0
    while running and not globals.force_quit:
        loop_idx += 1
        logger.debug(
            "top of while running loop; loop_idx: {}".format(loop_idx))
        datagram = None
        stats = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            pass
        except Exception as e:
            logger.error(
                "got exception from server.recv: {}; (worker {};{})".format(
                    e, worker_id, execution_id))
        logger.debug(
            "right after try/except datagram block: {}; (worker {};{})".format(
                timeit.default_timer(), worker_id, execution_id))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error(
                    "Error trying to put datagram on results channel. "
                    "Exception: {}; (worker {};{})".format(
                        e, worker_id, execution_id))
        logger.debug("right after results ch.put: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))

        # only try to collect stats if we have a stats_obj:
        if stats_obj:
            logger.debug(
                "we have a stats_obj; trying to collect stats. (worker {};{})".
                format(worker_id, execution_id))
            try:
                logger.debug(
                    "waiting on a stats obj: {}; (worker {};{})".format(
                        timeit.default_timer(), worker_id, execution_id))
                stats = next(stats_obj)
                logger.debug("got the stats obj: {}; (worker {};{})".format(
                    timeit.default_timer(), worker_id, execution_id))
            except StopIteration:
                # we have read the last stats object - no need for processing
                logger.debug(
                    "Got StopIteration; no stats object. (worker {};{})".
                    format(worker_id, execution_id))
            except ReadTimeoutError:
                # this is a ReadTimeoutError from docker, not requests. container is finished.
                logger.info(
                    "next(stats) just timed out: {}; (worker {};{})".format(
                        timeit.default_timer(), worker_id, execution_id))
                # UPDATE - 07-2018: under load, a ReadTimeoutError from the attempt to get a stats object
                # does NOT imply the container has stopped; we need to explicitly check the container status
                # to be sure.

        # if we got a stats object, add it to the results; it is possible stats collection timed out and the object
        # is None
        if stats:
            logger.debug("adding stats to results; (worker {};{})".format(
                worker_id, execution_id))
            try:
                result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
            except KeyError as e:
                logger.info(
                    "Got a KeyError trying to fetch the cpu object: {}; "
                    "(worker {};{})".format(e, worker_id, execution_id))
            try:
                result['io'] += stats['networks']['eth0']['rx_bytes']
            except KeyError as e:
                logger.info(
                    "Got KeyError exception trying to grab the io object. "
                    "running: {}; Exception: {}; (worker {};{})".format(
                        running, e, worker_id, execution_id))

        # grab the logs every 5th iteration --
        if loop_idx % 5 == 0:
            logs = cli.logs(container.get('Id'))
            Execution.set_logs(execution_id, logs)
            logs = None

        # checking the container status to see if it is still running ----
        if running:
            logger.debug(
                "about to check container status: {}; (worker {};{})".format(
                    timeit.default_timer(), worker_id, execution_id))
            # we need to wait for the container id to be available
            i = 0
            while i < 10:
                try:
                    c = cli.containers(all=True,
                                       filters={'id': container.get('Id')})[0]
                    break
                except IndexError:
                    logger.error(
                        "Got an IndexError trying to get the container object. "
                        "(worker {};{})".format(worker_id, execution_id))
                    time.sleep(0.1)
                    i += 1
            logger.debug(
                "done checking status: {}; i: {}; (worker {};{})".format(
                    timeit.default_timer(), i, worker_id, execution_id))
            # if we were never able to get the container object, we need to stop processing and kill this
            # worker; the docker daemon could be under heavy load, but we must not launch another
            # actor container with this worker, because the existing container may still be running.
            if i == 10 or not c:
                # we'll try to stop the container
                logger.error(
                    "Never could retrieve the container object! Attempting to stop container; "
                    "container id: {}; (worker {};{})".format(
                        container.get('Id'), worker_id, execution_id))
                # stop_container could raise an exception - if so, we let it pass up and have the worker
                # shut itself down.
                stop_container(cli, container.get('Id'))
                logger.info("container {} stopped. (worker {};{})".format(
                    container.get('Id'), worker_id, execution_id))

                # if we were able to stop the container, we can set running to False and keep the
                # worker running
                running = False
                continue
            state = c.get('State')
            if not state == 'running':
                logger.debug(
                    "container finished, final state: {}; (worker {};{})".
                    format(state, worker_id, execution_id))
                running = False
                continue
            else:
                # container still running; check if a force_quit has been sent OR
                # we are beyond the max_run_time
                runtime = timeit.default_timer() - start
                if globals.force_quit or (max_run_time > 0
                                          and max_run_time < runtime):
                    logs = cli.logs(container.get('Id'))
                    if globals.force_quit:
                        logger.info(
                            "issuing force quit: {}; (worker {};{})".format(
                                timeit.default_timer(), worker_id,
                                execution_id))
                    else:
                        logger.info(
                            "hit runtime limit: {}; (worker {};{})".format(
                                timeit.default_timer(), worker_id,
                                execution_id))
                    cli.stop(container.get('Id'))
                    running = False
            logger.debug(
                "right after checking container state: {}; (worker {};{})".
                format(timeit.default_timer(), worker_id, execution_id))
    logger.info("container stopped:{}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))
    stop = timeit.default_timer()
    globals.force_quit = False

    # get info from container execution, including exit code; Exceptions from any of these commands
    # should not cause the worker to shutdown or prevent starting subsequent actor containers.
    try:
        container_info = cli.inspect_container(container.get('Id'))
        try:
            container_state = container_info['State']
            try:
                exit_code = container_state['ExitCode']
            except KeyError as e:
                logger.error("Could not determine ExitCode for container {}. "
                             "Exception: {}; (worker {};{})".format(
                                 container.get('Id'), e, worker_id,
                                 execution_id))
                exit_code = 'undetermined'
        except KeyError as e:
            logger.error(
                "Could not determine final state for container {}. "
                "Exception: {}; (worker {};{})".format(container.get('Id'), e,
                                                       worker_id, execution_id))
            container_state = {'unavailable': True}
            exit_code = 'undetermined'
    except docker.errors.APIError as e:
        logger.error("Could not inspect container {}. "
                     "Exception: {}; (worker {};{})".format(
                         container.get('Id'), e, worker_id, execution_id))
        # ensure the values returned below are defined even when inspect fails
        container_state = {'unavailable': True}
        exit_code = 'undetermined'

    logger.debug(
        "right after getting container_info: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))
    # get logs from container
    if not logs:
        logs = cli.logs(container.get('Id'))
    if not logs:
        # there are issues where containers do not have logs associated with them when they should.
        logger.info("Container id {} had NO logs associated with it. "
                    "(worker {};{})".format(container.get('Id'), worker_id,
                                            execution_id))
    logger.debug(
        "right after getting container logs: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))

    # get any additional results from the execution:
    while True:
        datagram = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            break
        except Exception as e:
            logger.error(
                "Got exception from server.recv: {}; (worker {};{})".format(
                    e, worker_id, execution_id))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error(
                    "Error trying to put datagram on results channel. "
                    "Exception: {}; (worker {};{})".format(
                        e, worker_id, execution_id))
    logger.debug(
        "right after getting last execution results from datagram socket: {}; "
        "(worker {};{})".format(timeit.default_timer(), worker_id,
                                execution_id))
    if socket_host_path:
        server.close()
        os.remove(socket_host_path)
    logger.debug("right after removing socket: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # remove actor container with retrying logic -- check for specific filesystem errors from the docker daemon:
    if not leave_container:
        keep_trying = True
        count = 0
        while keep_trying and count < 10:
            keep_trying = False
            count = count + 1
            try:
                cli.remove_container(container=container)
                logger.info("Actor container removed. (worker {};{})".format(
                    worker_id, execution_id))
            except Exception as e:
                # if the container is already gone we definitely want to quit:
                if 'No such container' in str(e):
                    logger.info("Got 'no such container' exception - quiting. "
                                "Exception: {}; (worker {};{})".format(
                                    e, worker_id, execution_id))
                    break
                # if we get a resource busy/internal server error from docker, we need to keep trying to remove the
                # container.
                elif 'device or resource busy' in str(
                        e) or 'failed to remove root filesystem' in str(e):
                    logger.error(
                        "Got resource busy/failed to remove filesystem exception trying to remove "
                        "actor container; will keep trying."
                        "Count: {}; Exception: {}; (worker {};{})".format(
                            count, e, worker_id, execution_id))
                    time.sleep(1)
                    keep_trying = True
                else:
                    logger.error(
                        "Unexpected exception trying to remove actor container. Giving up."
                        "Exception: {}; type: {}; (worker {};{})".format(
                            e, type(e), worker_id, execution_id))
    else:
        logger.debug("leaving actor container since leave_container was True. "
                     "(worker {};{})".format(worker_id, execution_id))
    logger.debug(
        "right after removing actor container: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))

    if fifo_host_path:
        os.close(fifo)
        os.remove(fifo_host_path)
    if results_ch:
        results_ch.close()
    result['runtime'] = int(stop - start)
    logger.debug(
        "right after removing fifo; about to return: {}; (worker {};{})".
        format(timeit.default_timer(), worker_id, execution_id))
    return result, logs, container_state, exit_code, start_time
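
execute_actor() collects result frames over a Unix datagram socket that it binds on the host and mounts into the actor at /_abaco_results.sock. A minimal sketch of the actor-container side, assuming frames stay under MAX_RESULT_FRAME_SIZE (the helper itself is illustrative, not part of the code above):

# Sketch: sending one result frame from inside the actor container.
# Assumes the worker's datagram server is mounted at /_abaco_results.sock,
# as execute_actor() arranges above.
import socket

def send_result(frame, sock_path='/_abaco_results.sock'):
    """Send a single result frame; one datagram == one frame."""
    client = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
    try:
        client.sendto(frame, sock_path)
    finally:
        client.close()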
Exemplo n.º 37
0
    def getAllExecution(self, page_id):
        """Return one page of executions (2 per page) as plain dicts."""
        executions = Execution.select().paginate(page_id, 2)
        return [model_to_dict(execution) for execution in executions]
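
This last snippet relies on peewee models and playhouse's model_to_dict helper. A short usage sketch under those assumptions (the owning class name is hypothetical):

# Imports the method above relies on, plus a usage example.
# Execution is assumed to be a peewee Model defined elsewhere;
# ExecutionRepository is a hypothetical name for the owning class.
from playhouse.shortcuts import model_to_dict  # ships with peewee

repo = ExecutionRepository()
page_1 = repo.getAllExecution(1)  # first page, 2 executions per page
for row in page_1:
    print(row)                    # each row is a plain dict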