Пример #1
0
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.try_get_operation_by_id(operation_id)
        if not operation or operation.has_finished:
            LOGGER.warning(
                "Operation already stopped or not found is given to stop job: %s"
                % operation_id)
            return True

        operation_process = dao.get_operation_process_for_operation(
            operation_id)
        result = 0
        # Try to kill only if operation job process is not None
        if operation_process is not None:
            stop_command = TvbProfile.current.cluster.STOP_COMMAND % operation_process.job_id
            LOGGER.info("Stopping cluster operation: %s" % stop_command)
            result = os.system(stop_command)
            if result != 0:
                LOGGER.error(
                    "Stopping cluster operation was unsuccessful. Try following status with '"
                    + TvbProfile.current.cluster.STATUS_COMMAND +
                    "'" % operation_process.job_id)

        WorkflowService().persist_operation_state(operation,
                                                  model.STATUS_CANCELED)

        return result == 0
Пример #2
0
    def get_operation_details(self, operation_gid, is_group):
        """
        :returns: an entity OperationOverlayDetails filled with all information for current operation details.
        """

        if is_group:
            operation_group = self.get_operation_group_by_gid(operation_gid)
            operation = dao.get_operations_in_group(operation_group.id, False, True)
            # Reload, to make sure all attributes lazy are populated as well.
            operation = dao.get_operation_by_gid(operation.gid)
            no_of_op_in_group = dao.get_operations_in_group(operation_group.id, is_count=True)
            datatype_group = self.get_datatypegroup_by_op_group_id(operation_group.id)
            count_result = dao.count_datatypes_in_group(datatype_group.id)

        else:
            operation = dao.get_operation_by_gid(operation_gid)
            if operation is None:
                return None
            no_of_op_in_group = 1
            count_result = dao.count_resulted_datatypes(operation.id)

        user_display_name = dao.get_user_by_id(operation.fk_launched_by).display_name
        burst = dao.get_burst_for_operation_id(operation.id)
        datatypes_param, all_special_params = self._review_operation_inputs(operation.gid)

        op_pid = dao.get_operation_process_for_operation(operation.id)
        op_details = OperationOverlayDetails(operation, user_display_name, len(datatypes_param),
                                             count_result, burst, no_of_op_in_group, op_pid)

        # Add all parameter which are set differently by the user on this Operation.
        if all_special_params is not None:
            op_details.add_scientific_fields(all_special_params)
        return op_details
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.try_get_operation_by_id(operation_id)
        if not operation or operation.has_finished:
            LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
            return True

        LOGGER.debug("Stopping operation: %s" % str(operation_id))

        ## Set the thread stop flag to true
        for thread in CURRENT_ACTIVE_THREADS:
            if int(thread.operation_id) == operation_id:
                thread.stop()
                LOGGER.debug("Found running thread for operation: %d" % operation_id)

        ## Kill Thread
        stopped = True
        operation_process = dao.get_operation_process_for_operation(operation_id)
        if operation_process is not None:
            ## Now try to kill the operation if it exists
            stopped = OperationExecutor.stop_pid(operation_process.pid)
            if not stopped:
                LOGGER.debug("Operation %d was probably killed from it's specific thread." % operation_id)
            else:
                LOGGER.debug("Stopped OperationExecutor process for %d" % operation_id)

        ## Mark operation as canceled in DB and on disk
        WorkflowService().persist_operation_state(operation, model.STATUS_CANCELED)

        return stopped
Пример #4
0
    def get_operation_details(self, operation_gid, is_group):
        """
        :returns: an entity OperationOverlayDetails filled with all information for current operation details.
        """

        if is_group:
            operation_group = self.get_operation_group_by_gid(operation_gid)
            operation = dao.get_operations_in_group(operation_group.id, False, True)
            ## Reload, to make sure all attributes lazy are populated as well.
            operation = dao.get_operation_by_gid(operation.gid)
            no_of_op_in_group = dao.get_operations_in_group(operation_group.id, is_count=True)
            datatype_group = self.get_datatypegroup_by_op_group_id(operation_group.id)
            count_result = dao.count_datatypes_in_group(datatype_group.id)

        else:
            operation = dao.get_operation_by_gid(operation_gid)
            if operation is None:
                return None
            no_of_op_in_group = 1
            count_result = dao.count_resulted_datatypes(operation.id)

        username = dao.get_user_by_id(operation.fk_launched_by).username
        burst = dao.get_burst_for_operation_id(operation.id)
        datatypes_param, all_special_params = ProjectService._review_operation_inputs(operation.gid)

        op_pid = dao.get_operation_process_for_operation(operation.id)
        op_details = OperationOverlayDetails(operation, username, len(datatypes_param),
                                             count_result, burst, no_of_op_in_group, op_pid)

        ## Add all parameter which are set differently by the user on this Operation.
        if all_special_params is not None:
            op_details.add_scientific_fields(all_special_params)
        return op_details
Пример #5
0
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.try_get_operation_by_id(operation_id)
        if not operation or operation.has_finished:
            LOGGER.info("Operation already stopped or not found at ID: %s" % operation_id)
            return True

        LOGGER.debug("Stopping operation: %s" % str(operation_id))

        # Set the thread stop flag to true
        for thread in CURRENT_ACTIVE_THREADS:
            if int(thread.operation_id) == operation_id:
                thread._stop()
                LOGGER.debug("Found running thread for operation: %d" % operation_id)

        # Kill Thread
        stopped = True
        operation_process = dao.get_operation_process_for_operation(operation_id)
        if operation_process is not None:
            # Now try to kill the operation if it exists
            stopped = OperationExecutor.stop_pid(operation_process.pid)
            if not stopped:
                LOGGER.debug("Operation %d was probably killed from it's specific thread." % operation_id)
            else:
                LOGGER.debug("Stopped OperationExecutor process for %d" % operation_id)

        # Mark operation as canceled in DB and on disk
        BurstService().persist_operation_state(operation, STATUS_CANCELED)

        return stopped
Пример #6
0
    def _operation_finished(operation, simulator_gid):
        op_ident = dao.get_operation_process_for_operation(operation.id)
        # TODO: Handle login
        job = Job(
            Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY]),
            op_ident.job_id)

        operation = dao.get_operation_by_id(operation.id)
        folder = HPCSchedulerClient.storage_interface.get_project_folder(
            operation.project.name)
        storage_interface = StorageInterface()
        if storage_interface.encryption_enabled():
            storage_interface.inc_project_usage_count(folder)
            storage_interface.sync_folders(folder)

        try:
            sim_h5_filenames, metric_op, metric_h5_filename = \
                HPCSchedulerClient.stage_out_to_operation_folder(job.working_dir, operation, simulator_gid)

            operation.mark_complete(STATUS_FINISHED)
            dao.store_entity(operation)
            HPCSchedulerClient().update_db_with_results(
                operation, sim_h5_filenames, metric_op, metric_h5_filename)

        except OperationException as exception:
            HPCOperationService.LOGGER.error(exception)
            HPCOperationService._operation_error(operation)

        finally:
            if storage_interface.encryption_enabled():
                storage_interface.sync_folders(folder)
                storage_interface.set_project_inactive(operation.project)
Пример #7
0
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.get_operation_by_id(operation_id)
        if not operation or operation.status != model.STATUS_STARTED:
            LOGGER.warn("Operation %d was not found or has not the correct status, to be stopped." % operation_id)
            return False
        LOGGER.debug("Stopping operation: %s" % str(operation_id))

        ## Set the thread stop flag to true
        for thread in CURRENT_ACTIVE_THREADS:
            if int(thread.operation_id) == operation_id:
                thread.stop()
                LOGGER.debug("Found running thread for operation: %d" % operation_id)

        ## Kill Thread
        stopped = True
        operation_process = dao.get_operation_process_for_operation(operation_id)
        if operation_process is not None:
            ## Now try to kill the operation if it exists
            stopped = OperationExecutor.stop_pid(operation_process.pid)
            if not stopped:
                LOGGER.debug("Operation %d was probably killed from it's specific thread." % operation_id)
            else:
                LOGGER.debug("Stopped OperationExecutor process for %d" % operation_id)

        ## Mark operation as canceled in DB.
        operation.mark_cancelled()
        dao.store_entity(operation)
        return stopped
Пример #8
0
 def process_queued_operations():
     LOGGER.info("Start Queued operations job")
     try:
         operations = dao.get_generic_entity(Operation, True, "queue_full")
         if len(operations) == 0:
             return
         LOGGER.info(
             "Found {} operations with the queue full flag set.".format(
                 len(operations)))
         operations.sort(key=lambda l_operation: l_operation.id)
         for operation in operations[0:TvbProfile.current.
                                     MAX_THREADS_NUMBER]:
             try:
                 op = dao.get_operation_by_id(operation.id)
                 operation_process = dao.get_operation_process_for_operation(
                     operation.id)
                 if operation_process is not None or not op.queue_full or LOCKS_QUEUE.qsize(
                 ) == 0:
                     continue
                 LOCKS_QUEUE.get()
                 StandAloneClient.start_operation(operation.id)
             except Exception as e:
                 LOGGER.error("Starting operation error", e)
     except Exception as e:
         LOGGER.error("Error", e)
Пример #9
0
    def check_operations_job():
        operations = dao.get_operations()
        if operations is None or len(operations) == 0:
            return

        for operation in operations:
            HPCOperationService.LOGGER.info("Start processing operation {}".format(operation.id))
            try:
                op_ident = dao.get_operation_process_for_operation(operation.id)
                if op_ident is not None:
                    transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
                    job = Job(transport, op_ident.job_id)
                    job_status = job.properties['status']
                    if job.is_running():
                        if operation.status == STATUS_PENDING and job_status == HPCJobStatus.READY.value:
                            HPCOperationService._operation_started(operation)
                        HPCOperationService.LOGGER.info(
                            "CSCS job status: {} for operation {}.".format(job_status, operation.id))
                        return
                    HPCOperationService.LOGGER.info(
                        "Job for operation {} has status {}".format(operation.id, job_status))
                    if job_status == HPCJobStatus.SUCCESSFUL.value:
                        simulator_gid = operation.view_model_gid
                        HPCOperationService._operation_finished(operation, simulator_gid)
                    else:
                        HPCOperationService._operation_error(operation)
            except Exception:
                HPCOperationService.LOGGER.error(
                    "There was an error on background processing process for operation {}".format(operation.id),
                    exc_info=True)
 def _operation_finished(operation, simulator_gid):
     op_ident = dao.get_operation_process_for_operation(operation.id)
     # TODO: Handle login
     job = Job(
         Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY]),
         op_ident.job_id)
     sim_h5_filenames, metric_op, metric_h5_filename = \
         HPCSchedulerClient.stage_out_to_operation_folder(job.working_dir, operation, simulator_gid)
     operation.mark_complete(STATUS_FINISHED)
     dao.store_entity(operation)
     HPCSchedulerClient().update_db_with_results(operation,
                                                 sim_h5_filenames,
                                                 metric_op,
                                                 metric_h5_filename)
Пример #11
0
    def stop_operation_process(operation_id, notify_pods=False):
        if notify_pods and TvbProfile.current.web.OPENSHIFT_DEPLOY:
            try:
                LOGGER.info(
                    "Notify pods to stop operation process for {}".format(
                        operation_id))
                KubeService.notify_pods(
                    "/kube/stop_operation_process/{}".format(operation_id),
                    TvbProfile.current.web.
                    OPENSHIFT_PROCESSING_OPERATIONS_APPLICATION)
                return True
            except Exception as e:
                LOGGER.error("Stop operation notify error", e)
                return False
        else:
            # Set the thread stop flag to true
            operation_threads = []
            for thread in CURRENT_ACTIVE_THREADS:
                if int(thread.operation_id) == operation_id:
                    operation_threads.append(thread)

            if len(operation_threads) > 0:
                for thread in operation_threads:
                    thread._stop()
                    LOGGER.info("Found running thread for operation: %d" %
                                operation_id)
                    LOGGER.info("Thread marked to stop: {}".format(
                        thread.stopped()))
                    LOGGER.info("Thread: {}".format(thread))
                # Kill Thread
                stopped = True
                operation_process = dao.get_operation_process_for_operation(
                    operation_id)
                if operation_process is not None:
                    # Now try to kill the operation if it exists
                    stopped = OperationExecutor.stop_pid(operation_process.pid)
                    if not stopped:
                        LOGGER.debug(
                            "Operation %d was probably killed from it's specific thread."
                            % operation_id)
                    else:
                        LOGGER.debug(
                            "Stopped OperationExecutor process for %d" %
                            operation_id)
                return stopped

            LOGGER.info("Running thread was not found for operation {}".format(
                operation_id))
            return False
Пример #12
0
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.get_operation_by_id(operation_id)
        if not operation or operation.status != model.STATUS_STARTED:
            return False

        operation_process = dao.get_operation_process_for_operation(operation_id)
        result = 0
        ## Try to kill only if operation job process is not None
        if operation_process is not None:
            LOGGER.debug("Stopping cluster operation: %s, with job id: %s" % (operation_id, operation_process.job_id))
            result = os.system(config.CLUSTER_STOP_COMMAND % operation_process.job_id)

        ## Set operation as canceled, if kill command succeed, otherwise no operation process was found...
        if result == 0:
            operation.mark_cancelled()
            dao.store_entity(operation)
            return True
        return False
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.try_get_operation_by_id(operation_id)
        if not operation or operation.has_finished:
            LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
            return True

        operation_process = dao.get_operation_process_for_operation(operation_id)
        result = 0
        ## Try to kill only if operation job process is not None
        if operation_process is not None:
            stop_command = TvbProfile.current.cluster.STOP_COMMAND % operation_process.job_id
            LOGGER.info("Stopping cluster operation: %s" % stop_command)
            result = os.system(stop_command)
            if result != 0:
                LOGGER.error("Stopping cluster operation was unsuccessful. Try following status with '" +
                             TvbProfile.current.cluster.STATUS_COMMAND + "'" % operation_process.job_id)

        WorkflowService().persist_operation_state(operation, model.STATUS_CANCELED)

        return result == 0
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.get_operation_by_id(operation_id)
        if not operation or operation.status != model.STATUS_STARTED:
            LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
            return True

        operation_process = dao.get_operation_process_for_operation(operation_id)
        result = 0
        ## Try to kill only if operation job process is not None
        if operation_process is not None:
            stop_command = config.CLUSTER_STOP_COMMAND % operation_process.job_id
            LOGGER.info("Stopping cluster operation: %s" % stop_command)
            result = os.system(stop_command)
            if result != 0:
                LOGGER.error("Stopping cluster operation was unsuccessful. "
                             "Try following with 'oarstat' for job ID: %s" % operation_process.job_id)

        operation.mark_cancelled()
        dao.store_entity(operation)

        return result == 0
Пример #15
0
    def stop_operation(operation_id):
        """
        Stop a thread for a given operation id
        """
        operation = dao.try_get_operation_by_id(operation_id)
        if not operation or operation.has_finished:
            LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
            return True

        operation_process = dao.get_operation_process_for_operation(operation_id)
        result = 0
        ## Try to kill only if operation job process is not None
        if operation_process is not None:
            stop_command = TvbProfile.current.cluster.STOP_COMMAND % operation_process.job_id
            LOGGER.info("Stopping cluster operation: %s" % stop_command)
            result = os.system(stop_command)
            if result != 0:
                LOGGER.error("Stopping cluster operation was unsuccessful. "
                             "Try following with 'oarstat' for job ID: %s" % operation_process.job_id)

        operation.mark_complete(model.STATUS_CANCELED)
        dao.store_entity(operation)

        return result == 0