def run_do(self):
    """Detect local running tasks whose process is gone and stop their jobs.

    Queries the tasks recorded as ``running`` on this host's LAN IP, checks
    each task's recorded PID with ``job_utils.check_job_process`` and collects
    the job ids of tasks whose process no longer exists. For every collected
    job a ``kill`` stop request is sent to the job's initiator through
    ``api_utils.federated_api`` and ``TaskScheduler.finish_job`` is then
    called with ``stop=True``.

    Nothing is returned and no exception propagates to the caller: every
    failure is written to ``detect_logger``.
    """
    try:
        running_tasks = job_utils.query_task(status='running',
                                             run_ip=get_lan_ip())
        stop_job_ids = set()
        for task in running_tasks:
            # A bad record (e.g. a PID that cannot be converted to int) must
            # not prevent the remaining tasks from being checked.
            try:
                process_exist = job_utils.check_job_process(
                    int(task.f_run_pid))
                if not process_exist:
                    detect_logger.info(
                        'job {} component {} on {} {} task {} {} process does not exist'
                        .format(task.f_job_id, task.f_component_name,
                                task.f_role, task.f_party_id,
                                task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)

        if stop_job_ids:
            schedule_logger().info(
                'start to stop jobs: {}'.format(stop_job_ids))

        for job_id in stop_job_ids:
            # Isolate each job: a failure while stopping one job must not
            # leave the other dead jobs unstopped.
            try:
                jobs = job_utils.query_job(job_id=job_id)
                if not jobs:
                    continue
                job = jobs[0]
                initiator_party_id = job.f_initiator_party_id
                if len(jobs) > 1:
                    # i am initiator
                    my_party_id = initiator_party_id
                else:
                    my_party_id = job.f_party_id
                api_utils.federated_api(
                    job_id=job_id,
                    method='POST',
                    endpoint='/{}/job/stop'.format(API_VERSION),
                    src_party_id=my_party_id,
                    dest_party_id=initiator_party_id,
                    src_role=None,
                    json_body={'job_id': job_id, 'operate': 'kill'},
                    work_mode=job.f_work_mode)
                TaskScheduler.finish_job(
                    job_id=job_id,
                    job_runtime_conf=json_loads(job.f_runtime_conf),
                    stop=True)
            except Exception as e:
                detect_logger.exception(e)
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')
def is_alive(task):
    """Report whether the Linkis execution behind ``task`` is still alive.

    Builds the status URL from ``LINKIS_SPARK_CONFIG`` (host and port) and
    ``LINKIS_QUERT_STATUS``, with the literal ``execID`` placeholder replaced
    by the ``execID`` entry of ``task.f_engine_conf``, then issues a GET with
    the headers stored under ``task.f_engine_conf["headers"]``.

    Returns ``False`` when the reported status equals
    ``LinkisJobStatus.FAILED`` or when anything in the lookup raises (the
    exception is logged to ``detect_logger``); returns ``True`` otherwise.
    """
    try:
        status_url = "http://{}:{}{}".format(
            LINKIS_SPARK_CONFIG.get("host"),
            LINKIS_SPARK_CONFIG.get("port"),
            LINKIS_QUERT_STATUS.replace(
                "execID", task.f_engine_conf.get("execID")))
        request_headers = task.f_engine_conf["headers"]
        # NOTE(review): no timeout is passed to requests.get here, so an
        # unresponsive server can block this call — confirm that is intended.
        payload = requests.get(status_url, headers=request_headers).json()
        detect_logger.info(payload)
        if payload.get("data").get("status") == LinkisJobStatus.FAILED:
            return False
    except Exception as e:
        # Any failure to obtain or parse the status counts as "not alive".
        detect_logger.exception(e)
        return False
    return True
def run_do(self):
    """Detect local running tasks whose process is gone and stop their jobs.

    Queries the tasks recorded as ``running`` on this host's LAN IP, checks
    each task's recorded PID with ``job_utils.check_job_process`` and collects
    the job ids of tasks whose process no longer exists. For every collected
    job that still has a job record, a stop request is posted through
    ``api_utils.local_api``.

    Nothing is returned and no exception propagates to the caller: every
    failure is written to ``detect_logger``.
    """
    try:
        running_tasks = job_utils.query_task(status='running',
                                             run_ip=get_lan_ip())
        stop_job_ids = set()
        detect_logger.info('start to detect running job..')
        for task in running_tasks:
            # A bad record (e.g. a PID that cannot be converted to int) must
            # not prevent the remaining tasks from being checked.
            try:
                process_exist = job_utils.check_job_process(
                    int(task.f_run_pid))
                if not process_exist:
                    detect_logger.info(
                        'job {} component {} on {} {} task {} {} process does not exist'
                        .format(task.f_job_id, task.f_component_name,
                                task.f_role, task.f_party_id,
                                task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)

        if stop_job_ids:
            schedule_logger.info(
                'start to stop jobs: {}'.format(stop_job_ids))

        for job_id in stop_job_ids:
            # Isolate each job: a failure while stopping one job must not
            # leave the other dead jobs unstopped.
            try:
                # The stop request only needs the job id; the job record is
                # queried solely to skip ids that no longer have one.
                if not job_utils.query_job(job_id=job_id):
                    continue
                api_utils.local_api(
                    method='POST',
                    endpoint='/{}/job/stop'.format(API_VERSION),
                    json_body={'job_id': job_id})
                schedule_logger.info(
                    'send stop job {} command'.format(job_id))
            except Exception as e:
                detect_logger.exception(e)
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')