def stop_operation(operation_id):
    """
    Stop the cluster job that runs the operation with the given id.

    The operation is looked up with ``dao.try_get_operation_by_id``. When it is
    missing or already finished, nothing is done and ``True`` is returned.
    Otherwise, if a process entity is recorded for the operation, the configured
    cluster stop command is executed for its ``job_id``. The operation is then
    persisted as canceled, whether or not the stop command succeeded.

    :param operation_id: identifier of the operation to stop
    :returns: ``True`` when there was nothing to stop or the stop command
        exited with status 0, ``False`` otherwise
    """
    operation = dao.try_get_operation_by_id(operation_id)
    if not operation or operation.has_finished:
        LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
        return True

    operation_process = dao.get_operation_process_for_operation(operation_id)
    result = 0
    # Try to kill only if operation job process is not None
    if operation_process is not None:
        stop_command = TvbProfile.current.cluster.STOP_COMMAND % operation_process.job_id
        LOGGER.info("Stopping cluster operation: %s" % stop_command)
        result = os.system(stop_command)
        if result != 0:
            # The job id must be interpolated into STATUS_COMMAND itself. Applying "%" to the
            # trailing "'" literal (as "%" binds tighter than "+") raised TypeError here.
            # NOTE(review): assumes STATUS_COMMAND carries one placeholder, like STOP_COMMAND.
            status_command = TvbProfile.current.cluster.STATUS_COMMAND % operation_process.job_id
            LOGGER.error("Stopping cluster operation was unsuccessful. "
                         "Try following status with '" + status_command + "'")

    WorkflowService().persist_operation_state(operation, model.STATUS_CANCELED)
    return result == 0
def get_operation_details(self, operation_gid, is_group):
    """
    Build the overlay details for one operation or for an operation group.

    :param operation_gid: GID of the operation, or of the operation group when `is_group` is set
    :param is_group: truthy when `operation_gid` identifies an operation group
    :returns: an OperationOverlayDetails entity filled with the information gathered for the
        operation, or None when a non-group operation can not be found by its GID
    """
    if not is_group:
        operation = dao.get_operation_by_gid(operation_gid)
        if operation is None:
            return None
        no_of_op_in_group = 1
        count_result = dao.count_resulted_datatypes(operation.id)
    else:
        op_group = self.get_operation_group_by_gid(operation_gid)
        group_operation = dao.get_operations_in_group(op_group.id, False, True)
        # Fetch the operation again by GID, so that lazily loaded attributes are populated too.
        operation = dao.get_operation_by_gid(group_operation.gid)
        no_of_op_in_group = dao.get_operations_in_group(op_group.id, is_count=True)
        dt_group = self.get_datatypegroup_by_op_group_id(op_group.id)
        count_result = dao.count_datatypes_in_group(dt_group.id)

    launcher = dao.get_user_by_id(operation.fk_launched_by)
    user_display_name = launcher.display_name
    burst = dao.get_burst_for_operation_id(operation.id)
    datatypes_param, all_special_params = self._review_operation_inputs(operation.gid)
    op_pid = dao.get_operation_process_for_operation(operation.id)

    details = OperationOverlayDetails(operation, user_display_name, len(datatypes_param),
                                      count_result, burst, no_of_op_in_group, op_pid)

    # Parameters returned as "special" by the inputs review are attached as scientific fields.
    if all_special_params is not None:
        details.add_scientific_fields(all_special_params)
    return details
def stop_operation(operation_id):
    """
    Stop the local thread and process that execute the given operation.

    :param operation_id: identifier of the operation to stop
    :returns: True when the operation is missing or already finished, or when no process is
        recorded for it; otherwise the result of `OperationExecutor.stop_pid`
    """
    operation = dao.try_get_operation_by_id(operation_id)
    if not operation or operation.has_finished:
        LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
        return True

    LOGGER.debug("Stopping operation: %s" % str(operation_id))

    ## Ask every active thread bound to this operation to stop
    for active_thread in CURRENT_ACTIVE_THREADS:
        if int(active_thread.operation_id) != operation_id:
            continue
        active_thread.stop()
        LOGGER.debug("Found running thread for operation: %d" % operation_id)

    ## Kill the process recorded for the operation, when there is one
    process_entity = dao.get_operation_process_for_operation(operation_id)
    if process_entity is None:
        stopped = True
    else:
        stopped = OperationExecutor.stop_pid(process_entity.pid)
        if stopped:
            LOGGER.debug("Stopped OperationExecutor process for %d" % operation_id)
        else:
            LOGGER.debug("Operation %d was probably killed from it's specific thread." % operation_id)

    ## Persist the canceled state, regardless of the kill outcome
    WorkflowService().persist_operation_state(operation, model.STATUS_CANCELED)
    return stopped
def get_operation_details(self, operation_gid, is_group):
    """
    Collect the overlay details of an operation, or of an operation group.

    :param operation_gid: GID of the operation, or of the operation group when `is_group` is set
    :param is_group: truthy when `operation_gid` identifies an operation group
    :returns: an OperationOverlayDetails entity filled with the information gathered for the
        operation, or None when a non-group operation can not be found by its GID
    """
    if not is_group:
        operation = dao.get_operation_by_gid(operation_gid)
        if operation is None:
            return None
        no_of_op_in_group = 1
        count_result = dao.count_resulted_datatypes(operation.id)
    else:
        group_entity = self.get_operation_group_by_gid(operation_gid)
        group_operation = dao.get_operations_in_group(group_entity.id, False, True)
        ## Load the operation once more by GID, so lazy attributes get populated as well.
        operation = dao.get_operation_by_gid(group_operation.gid)
        no_of_op_in_group = dao.get_operations_in_group(group_entity.id, is_count=True)
        dt_group = self.get_datatypegroup_by_op_group_id(group_entity.id)
        count_result = dao.count_datatypes_in_group(dt_group.id)

    launcher = dao.get_user_by_id(operation.fk_launched_by)
    username = launcher.username
    burst = dao.get_burst_for_operation_id(operation.id)
    datatypes_param, all_special_params = ProjectService._review_operation_inputs(operation.gid)
    op_pid = dao.get_operation_process_for_operation(operation.id)

    details = OperationOverlayDetails(operation, username, len(datatypes_param),
                                      count_result, burst, no_of_op_in_group, op_pid)

    ## Parameters returned as "special" by the inputs review are attached as scientific fields.
    if all_special_params is not None:
        details.add_scientific_fields(all_special_params)
    return details
def stop_operation(operation_id):
    """
    Stop the local thread and process that execute the given operation.

    :param operation_id: identifier of the operation to stop
    :returns: True when the operation is missing or already finished, or when no process is
        recorded for it; otherwise the result of `OperationExecutor.stop_pid`
    """
    operation = dao.try_get_operation_by_id(operation_id)
    if not operation or operation.has_finished:
        LOGGER.info("Operation already stopped or not found at ID: %s" % operation_id)
        return True

    LOGGER.debug("Stopping operation: %s" % str(operation_id))

    # Flag every active thread bound to this operation for stopping
    for running_thread in CURRENT_ACTIVE_THREADS:
        if int(running_thread.operation_id) != operation_id:
            continue
        running_thread._stop()
        LOGGER.debug("Found running thread for operation: %d" % operation_id)

    # Kill the process recorded for the operation, when there is one
    process_entity = dao.get_operation_process_for_operation(operation_id)
    if process_entity is None:
        stopped = True
    else:
        stopped = OperationExecutor.stop_pid(process_entity.pid)
        if stopped:
            LOGGER.debug("Stopped OperationExecutor process for %d" % operation_id)
        else:
            LOGGER.debug("Operation %d was probably killed from it's specific thread." % operation_id)

    # Persist the canceled state, regardless of the kill outcome
    BurstService().persist_operation_state(operation, STATUS_CANCELED)
    return stopped
def _operation_finished(operation, simulator_gid):
    """
    Stage out the results of a finished HPC job and mark its operation as finished.

    The operation is reloaded from the DB before its project folder is resolved. When the
    storage interface reports encryption as enabled, the project folder is synchronized
    before and after the stage-out. An OperationException raised while handling the results
    is logged and the operation is passed to `HPCOperationService._operation_error`.

    :param operation: operation entity whose job has completed
    :param simulator_gid: GID passed on to the stage-out step
    """
    op_ident = dao.get_operation_process_for_operation(operation.id)
    # TODO: Handle login
    transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
    job = Job(transport, op_ident.job_id)

    operation = dao.get_operation_by_id(operation.id)
    project_name = operation.project.name
    folder = HPCSchedulerClient.storage_interface.get_project_folder(project_name)

    storage_interface = StorageInterface()
    if storage_interface.encryption_enabled():
        storage_interface.inc_project_usage_count(folder)
        storage_interface.sync_folders(folder)

    try:
        staged_out = HPCSchedulerClient.stage_out_to_operation_folder(job.working_dir, operation,
                                                                      simulator_gid)
        sim_h5_filenames, metric_op, metric_h5_filename = staged_out

        operation.mark_complete(STATUS_FINISHED)
        dao.store_entity(operation)
        HPCSchedulerClient().update_db_with_results(operation, sim_h5_filenames,
                                                    metric_op, metric_h5_filename)
    except OperationException as exception:
        HPCOperationService.LOGGER.error(exception)
        HPCOperationService._operation_error(operation)
    finally:
        # Runs on success and on failure alike
        if storage_interface.encryption_enabled():
            storage_interface.sync_folders(folder)
            storage_interface.set_project_inactive(operation.project)
def stop_operation(operation_id):
    """
    Stop the local thread and process that execute the given operation.

    Only an operation whose status is ``model.STATUS_STARTED`` is stopped. Every active
    thread bound to the operation is asked to stop, the recorded process (if any) is killed,
    and the operation is marked as cancelled in the DB.

    :param operation_id: identifier of the operation to stop
    :returns: False when the operation is missing or not in the started status; otherwise
        True when no process is recorded, or the result of `OperationExecutor.stop_pid`
    """
    operation = dao.get_operation_by_id(operation_id)
    if not operation or operation.status != model.STATUS_STARTED:
        # Logger.warn is a deprecated alias; use Logger.warning
        LOGGER.warning("Operation %d was not found or has not the correct status, to be stopped." % operation_id)
        return False

    LOGGER.debug("Stopping operation: %s" % str(operation_id))

    ## Set the thread stop flag to true
    for thread in CURRENT_ACTIVE_THREADS:
        if int(thread.operation_id) == operation_id:
            thread.stop()
            LOGGER.debug("Found running thread for operation: %d" % operation_id)

    ## Kill Thread
    stopped = True
    operation_process = dao.get_operation_process_for_operation(operation_id)
    if operation_process is not None:
        ## Now try to kill the operation if it exists
        stopped = OperationExecutor.stop_pid(operation_process.pid)
        if not stopped:
            LOGGER.debug("Operation %d was probably killed from it's specific thread." % operation_id)
        else:
            LOGGER.debug("Stopped OperationExecutor process for %d" % operation_id)

    ## Mark operation as canceled in DB.
    operation.mark_cancelled()
    dao.store_entity(operation)
    return stopped
def process_queued_operations():
    """
    Try to start operations that are flagged with ``queue_full``.

    The flagged operations are sorted by id and at most
    ``TvbProfile.current.MAX_THREADS_NUMBER`` of them are considered. An operation is skipped
    when it already has a process entity, when its reloaded ``queue_full`` flag is no longer
    set, or when ``LOCKS_QUEUE`` reports size 0. Otherwise one item is taken from
    ``LOCKS_QUEUE`` and the operation is started through ``StandAloneClient``.

    Failures are logged with their traceback and never propagate to the caller.
    """
    LOGGER.info("Start Queued operations job")
    try:
        operations = dao.get_generic_entity(Operation, True, "queue_full")
        if not operations:
            return

        LOGGER.info("Found {} operations with the queue full flag set.".format(len(operations)))
        operations.sort(key=lambda l_operation: l_operation.id)

        for operation in operations[0:TvbProfile.current.MAX_THREADS_NUMBER]:
            try:
                # Reload the operation, to check the current value of its queue_full flag
                op = dao.get_operation_by_id(operation.id)
                operation_process = dao.get_operation_process_for_operation(operation.id)
                if operation_process is not None or not op.queue_full or LOCKS_QUEUE.qsize() == 0:
                    continue
                LOCKS_QUEUE.get()
                StandAloneClient.start_operation(operation.id)
            except Exception:
                # Passing the exception as a positional argument made logging treat it as a
                # %-format argument for a message without placeholders; attach it as exc_info.
                LOGGER.error("Starting operation error", exc_info=True)
    except Exception:
        LOGGER.error("Error", exc_info=True)
def check_operations_job():
    """
    Check the HPC job behind every operation returned by ``dao.get_operations`` and update it.

    For each operation that has a process entity, the job status is read:

    - a running job only triggers ``_operation_started`` when the operation is still pending
      and the job status is READY;
    - a job that is no longer running is handed to ``_operation_finished`` when its status is
      SUCCESSFUL, and to ``_operation_error`` otherwise.

    An exception raised while handling one operation is logged with its traceback and does
    not stop the processing of the remaining operations.
    """
    operations = dao.get_operations()
    if not operations:
        return

    for operation in operations:
        HPCOperationService.LOGGER.info("Start processing operation {}".format(operation.id))
        try:
            op_ident = dao.get_operation_process_for_operation(operation.id)
            if op_ident is None:
                continue

            transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
            job = Job(transport, op_ident.job_id)
            job_status = job.properties['status']

            if job.is_running():
                if operation.status == STATUS_PENDING and job_status == HPCJobStatus.READY.value:
                    HPCOperationService._operation_started(operation)
                HPCOperationService.LOGGER.info(
                    "CSCS job status: {} for operation {}.".format(job_status, operation.id))
                # Move on to the next operation. Returning here would leave every remaining
                # operation unchecked for as long as this job keeps running.
                continue

            HPCOperationService.LOGGER.info(
                "Job for operation {} has status {}".format(operation.id, job_status))
            if job_status == HPCJobStatus.SUCCESSFUL.value:
                simulator_gid = operation.view_model_gid
                HPCOperationService._operation_finished(operation, simulator_gid)
            else:
                HPCOperationService._operation_error(operation)
        except Exception:
            HPCOperationService.LOGGER.error(
                "There was an error on background processing process for operation {}".format(operation.id),
                exc_info=True)
def _operation_finished(operation, simulator_gid):
    """
    Stage out the results of a finished HPC job and mark its operation as finished.

    :param operation: operation entity whose job has completed
    :param simulator_gid: GID passed on to the stage-out step
    """
    op_ident = dao.get_operation_process_for_operation(operation.id)
    # TODO: Handle login
    transport = Transport(os.environ[HPCSchedulerClient.CSCS_LOGIN_TOKEN_ENV_KEY])
    job = Job(transport, op_ident.job_id)

    staged_out = HPCSchedulerClient.stage_out_to_operation_folder(job.working_dir, operation,
                                                                  simulator_gid)
    sim_h5_filenames, metric_op, metric_h5_filename = staged_out

    # Persist the finished status before the staged-out results are registered in the DB
    operation.mark_complete(STATUS_FINISHED)
    dao.store_entity(operation)
    HPCSchedulerClient().update_db_with_results(operation, sim_h5_filenames,
                                                metric_op, metric_h5_filename)
def stop_operation_process(operation_id, notify_pods=False):
    """
    Stop the thread(s) and process executing an operation, locally or by notifying pods.

    When `notify_pods` is set and the current profile is an OpenShift deployment, the stop
    request is only forwarded through `KubeService.notify_pods`. Otherwise the active threads
    bound to the operation are asked to stop and the recorded process (if any) is killed.

    :param operation_id: identifier of the operation to stop
    :param notify_pods: when truthy (and OPENSHIFT_DEPLOY is set), notify the pods instead of
        stopping locally
    :returns: on the notify path, True when the notification was sent and False when it
        raised; on the local path, False when no active thread was found for the operation,
        otherwise True when no process is recorded, or the result of
        `OperationExecutor.stop_pid`
    """
    if notify_pods and TvbProfile.current.web.OPENSHIFT_DEPLOY:
        try:
            LOGGER.info("Notify pods to stop operation process for {}".format(operation_id))
            KubeService.notify_pods("/kube/stop_operation_process/{}".format(operation_id),
                                    TvbProfile.current.web.OPENSHIFT_PROCESSING_OPERATIONS_APPLICATION)
            return True
        except Exception:
            # Passing the exception as a positional argument made logging treat it as a
            # %-format argument for a message without placeholders; attach it as exc_info.
            LOGGER.error("Stop operation notify error", exc_info=True)
            return False

    # Collect the active threads bound to this operation
    operation_threads = [thread for thread in CURRENT_ACTIVE_THREADS
                         if int(thread.operation_id) == operation_id]

    if not operation_threads:
        LOGGER.info("Running thread was not found for operation {}".format(operation_id))
        return False

    # Set the thread stop flag to true
    for thread in operation_threads:
        thread._stop()
        LOGGER.info("Found running thread for operation: %d" % operation_id)
        LOGGER.info("Thread marked to stop: {}".format(thread.stopped()))
        LOGGER.info("Thread: {}".format(thread))

    # Kill Thread
    stopped = True
    operation_process = dao.get_operation_process_for_operation(operation_id)
    if operation_process is not None:
        # Now try to kill the operation if it exists
        stopped = OperationExecutor.stop_pid(operation_process.pid)
        if not stopped:
            LOGGER.debug("Operation %d was probably killed from it's specific thread." % operation_id)
        else:
            LOGGER.debug("Stopped OperationExecutor process for %d" % operation_id)
    return stopped
def stop_operation(operation_id):
    """
    Stop the cluster job of a started operation and mark the operation as cancelled.

    :param operation_id: identifier of the operation to stop
    :returns: False when the operation is missing, is not in the started status, or the stop
        command exited with a non-zero status; True once the operation was marked cancelled
    """
    operation = dao.get_operation_by_id(operation_id)
    if not operation or operation.status != model.STATUS_STARTED:
        return False

    process_entity = dao.get_operation_process_for_operation(operation_id)
    exit_status = 0
    ## The stop command is issued only when a job process is recorded for the operation
    if process_entity is not None:
        LOGGER.debug("Stopping cluster operation: %s, with job id: %s" % (operation_id, process_entity.job_id))
        exit_status = os.system(config.CLUSTER_STOP_COMMAND % process_entity.job_id)

    if exit_status != 0:
        return False

    ## Either the stop command succeeded, or there was no job process to kill
    operation.mark_cancelled()
    dao.store_entity(operation)
    return True
def stop_operation(operation_id):
    """
    Stop the cluster job that runs the operation with the given id.

    When the operation is missing or already finished, nothing is done and ``True`` is
    returned. Otherwise, if a process entity is recorded for the operation, the configured
    cluster stop command is executed for its ``job_id``. The operation is then persisted as
    canceled, whether or not the stop command succeeded.

    :param operation_id: identifier of the operation to stop
    :returns: ``True`` when there was nothing to stop or the stop command exited with
        status 0, ``False`` otherwise
    """
    operation = dao.try_get_operation_by_id(operation_id)
    if not operation or operation.has_finished:
        LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
        return True

    operation_process = dao.get_operation_process_for_operation(operation_id)
    result = 0
    ## Try to kill only if operation job process is not None
    if operation_process is not None:
        job_id = operation_process.job_id
        stop_command = TvbProfile.current.cluster.STOP_COMMAND % job_id
        LOGGER.info("Stopping cluster operation: %s" % stop_command)
        result = os.system(stop_command)
        if result != 0:
            ## "%" binds tighter than "+", so the job id used to be applied to the closing
            ## quote literal alone and raised TypeError. Format STATUS_COMMAND itself instead.
            ## NOTE(review): assumes STATUS_COMMAND carries one placeholder, like STOP_COMMAND.
            status_command = TvbProfile.current.cluster.STATUS_COMMAND % job_id
            LOGGER.error("Stopping cluster operation was unsuccessful. "
                         "Try following status with '" + status_command + "'")

    WorkflowService().persist_operation_state(operation, model.STATUS_CANCELED)
    return result == 0
def stop_operation(operation_id):
    """
    Stop the cluster job of a started operation and mark the operation as cancelled.

    :param operation_id: identifier of the operation to stop
    :returns: True when the operation is missing or not in the started status, or when the
        stop command exited with status 0 (or was not needed); False otherwise
    """
    operation = dao.get_operation_by_id(operation_id)
    if not operation or operation.status != model.STATUS_STARTED:
        LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
        return True

    exit_status = 0
    process_entity = dao.get_operation_process_for_operation(operation_id)
    ## The stop command is issued only when a job process is recorded for the operation
    if process_entity is not None:
        stop_command = config.CLUSTER_STOP_COMMAND % process_entity.job_id
        LOGGER.info("Stopping cluster operation: %s" % stop_command)
        exit_status = os.system(stop_command)
        if exit_status != 0:
            LOGGER.error("Stopping cluster operation was unsuccessful. "
                         "Try following with 'oarstat' for job ID: %s" % process_entity.job_id)

    ## The operation is marked as cancelled even when the stop command failed
    operation.mark_cancelled()
    dao.store_entity(operation)

    return exit_status == 0
def stop_operation(operation_id):
    """
    Stop the cluster job of an unfinished operation and mark the operation as canceled.

    :param operation_id: identifier of the operation to stop
    :returns: True when the operation is missing or already finished, or when the stop
        command exited with status 0 (or was not needed); False otherwise
    """
    operation = dao.try_get_operation_by_id(operation_id)
    if not operation or operation.has_finished:
        LOGGER.warning("Operation already stopped or not found is given to stop job: %s" % operation_id)
        return True

    exit_status = 0
    process_entity = dao.get_operation_process_for_operation(operation_id)
    ## The stop command is issued only when a job process is recorded for the operation
    if process_entity is not None:
        stop_command = TvbProfile.current.cluster.STOP_COMMAND % process_entity.job_id
        LOGGER.info("Stopping cluster operation: %s" % stop_command)
        exit_status = os.system(stop_command)
        if exit_status != 0:
            LOGGER.error("Stopping cluster operation was unsuccessful. "
                         "Try following with 'oarstat' for job ID: %s" % process_entity.job_id)

    ## The operation is completed with the canceled status even when the stop command failed
    operation.mark_complete(model.STATUS_CANCELED)
    dao.store_entity(operation)

    return exit_status == 0