Example #1
@classmethod
def request_stop_jobs(cls, jobs: [Job], stop_msg, stop_status):
    # Nothing to do when no jobs were collected.
    if not len(jobs):
        return
    detect_logger().info(
        f"have {len(jobs)} should be stopped, because of {stop_msg}")
    for job in jobs:
        try:
            detect_logger(job_id=job.f_job_id).info(
                f"detector request start to stop job {job.f_job_id}, because of {stop_msg}"
            )
            # Ask the federated scheduler to stop the job with the requested status.
            FederatedScheduler.request_stop_job(job=job,
                                                stop_status=stop_status)
            detect_logger(job_id=job.f_job_id).info(
                f"detector request stop job {job.f_job_id} successfully")
        except Exception as e:
            # A failure to stop one job must not abort the rest of the batch.
            detect_logger(job_id=job.f_job_id).exception(e)
Example #2
@classmethod
def detect_running_job(cls):
    detect_logger().info('start detect running job')
    try:
        # Only jobs for which this party is the initiator are checked for timeout.
        running_jobs = JobSaver.query_job(status=JobStatus.RUNNING,
                                          is_initiator=True)
        stop_jobs = set()
        for job in running_jobs:
            try:
                if job_utils.check_job_is_timeout(job):
                    stop_jobs.add(job)
            except Exception as e:
                detect_logger(job_id=job.f_job_id).exception(e)
        # Timed-out jobs are stopped with the TIMEOUT status (see Example #1).
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="running timeout",
                              stop_status=JobStatus.TIMEOUT)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info('finish detect running job')
Example #3
@classmethod
def detect_expired_session(cls):
    detect_logger().info('start detect expired session')
    # Sessions idle beyond the TTL (5 * 60 * 60 * 1000 ms = 5 hours) are destroyed.
    sessions_record = StorageSessionBase.query_expired_sessions_record(
        ttl=5 * 60 * 60 * 1000)
    for session_record in sessions_record:
        detect_logger().info('start stop session id {}'.format(
            session_record.f_session_id))
        # Rebuild the session handle from its record, then tear it down.
        session = storage.Session.build(
            session_id=session_record.f_session_id,
            storage_engine=session_record.f_engine_name)
        session.destroy_session()
        detect_logger().info('session id {} success'.format(
            session_record.f_session_id))
Example #4
@classmethod
def detect_resource_record(cls):
    detect_logger().info('start detect resource recycle')
    try:
        filter_status = EndStatus.status_list()
        filter_status.append(JobStatus.WAITING)
        # Jobs that still hold resources more than 10 minutes after applying for them,
        # and whose status is an end status or WAITING ('<<' filters on status IN filter_status).
        jobs = Job.select().where(
            Job.f_resource_in_use == True,
            current_timestamp() - Job.f_apply_resource_time > 10 * 60 * 1000,
            Job.f_status << filter_status)
        stop_jobs = set()
        for job in jobs:
            if job.f_status == JobStatus.WAITING:
                # Jobs stuck in WAITING past the threshold are requested to stop below.
                stop_jobs.add(job)
            else:
                try:
                    detect_logger(job_id=job.f_job_id).info(
                        f"start to return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource"
                    )
                    # Jobs already in an end status simply return the resources they hold.
                    flag = ResourceManager.return_job_resource(
                        job_id=job.f_job_id,
                        role=job.f_role,
                        party_id=job.f_party_id)
                    if flag:
                        detect_logger(job_id=job.f_job_id).info(
                            f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource successfully"
                        )
                    else:
                        detect_logger(job_id=job.f_job_id).info(
                            f"return job {job.f_job_id} on {job.f_role} {job.f_party_id} resource failed"
                        )
                except Exception as e:
                    detect_logger(job_id=job.f_job_id).exception(e)
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="start timeout",
                              stop_status=JobStatus.TIMEOUT)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info('finish detect resource recycle')
Example #5
@classmethod
def detect_running_task(cls):
    detect_logger().info('start to detect running task..')
    count = 0
    try:
        # Only tasks run by this party on this host (matched by run_ip) are checked.
        running_tasks = JobSaver.query_task(
            party_status=TaskStatus.RUNNING,
            run_on_this_party=True,
            run_ip=RuntimeConfig.JOB_SERVER_HOST,
            only_latest=False)
        stop_job_ids = set()
        for task in running_tasks:
            count += 1
            try:
                # A RUNNING task whose executor process has disappeared marks its job for stopping.
                process_exist = job_utils.check_job_process(
                    int(task.f_run_pid))
                if not process_exist:
                    detect_logger(job_id=task.f_job_id).info(
                        'job {} task {} {} on {} {} process {} does not exist'
                        .format(task.f_job_id, task.f_task_id,
                                task.f_task_version, task.f_role,
                                task.f_party_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger(job_id=task.f_job_id).exception(e)
        if stop_job_ids:
            detect_logger().info(
                'start to stop jobs: {}'.format(stop_job_ids))
        stop_jobs = set()
        for job_id in stop_job_ids:
            jobs = JobSaver.query_job(job_id=job_id)
            if jobs:
                stop_jobs.add(jobs[0])
        cls.request_stop_jobs(jobs=stop_jobs,
                              stop_msg="task executor process abort",
                              stop_status=JobStatus.CANCELED)
    except Exception as e:
        detect_logger().exception(e)
    finally:
        detect_logger().info(f"finish detect {count} running task")