def run(self):
    time.sleep(5)
    jobs = job_utils.query_job(status='running', is_initiator=1)
    job_ids = set([job.f_job_id for job in jobs])
    for job_id in job_ids:
        schedule_logger(job_id).info('fate flow server start clean job')
        TaskScheduler.stop(job_id, JobStatus.FAILED)
def save_pipeline(job_id, role, party_id, model_id, model_version):
    schedule_logger(job_id).info('job {} on {} {} start to save pipeline'.format(job_id, role, party_id))
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id, role=role,
                                                                                    party_id=party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    job_type = job_parameters.get('job_type', '')
    if job_type == 'predict':
        return
    dag = job_utils.get_job_dsl_parser(dsl=job_dsl,
                                       runtime_conf=job_runtime_conf,
                                       train_runtime_conf=train_runtime_conf)
    predict_dsl = dag.get_predict_dsl(role=role)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    job_tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
                           model_id=model_id, model_version=model_version)
    job_tracker.save_pipeline(pipelined_buffer_object=pipeline)
    schedule_logger(job_id).info('job {} on {} {} save pipeline successfully'.format(job_id, role, party_id))
def mediation_queue_put_events(queue):
    n = queue.qsize(status=5)
    stat_logger.info('start check mediation queue, total num {}'.format(n))
    for i in range(n):
        event = queue.get_event(status=5)
        try:
            TaskScheduler.cancel_ready(event['job_id'], event['initiator_role'], event['initiator_party_id'])
            is_failed = queue.put_event(event, job_id=event['job_id'], status=1)
            schedule_logger(event['job_id']).info(
                'job into queue_1 status is {}'.format('success' if not is_failed else 'failed'))
            if is_failed:
                schedule_logger(event['job_id']).info('start to cancel job')
                TaskScheduler.stop(job_id=event['job_id'], end_status=JobStatus.CANCELED)
        except Exception as e:
            schedule_logger(event['job_id']).error(e)
            try:
                schedule_logger(event['job_id']).info('start to cancel job')
                TaskScheduler.stop(job_id=event['job_id'], end_status=JobStatus.CANCELED)
            except:
                schedule_logger(event['job_id']).info('cancel job failed')
def sync_task_status(job_id, component_name, task_id, role, party_id, initiator_party_id, initiator_role, task_info,
                     update=False):
    sync_success = True
    for dest_party_id in {party_id, initiator_party_id}:
        if party_id != initiator_party_id and dest_party_id == initiator_party_id:
            # do not pass the process id to the initiator
            task_info['f_run_ip'] = ''
        response = federated_api(job_id=job_id,
                                 method='POST',
                                 endpoint='/{}/schedule/{}/{}/{}/{}/{}/status'.format(
                                     API_VERSION, job_id, component_name, task_id, role, party_id),
                                 src_party_id=party_id,
                                 dest_party_id=dest_party_id,
                                 src_role=role,
                                 json_body=task_info,
                                 work_mode=RuntimeConfig.WORK_MODE)
        if response['retcode']:
            sync_success = False
            schedule_logger().exception(
                'job {} role {} party {} synchronize task status failed'.format(job_id, role, party_id))
            break
    if not sync_success and not update:
        task_info['f_status'] = TaskStatus.FAILED
        TaskExecutor.sync_task_status(job_id, component_name, task_id, role, party_id, initiator_party_id,
                                      initiator_role, task_info, update=True)
    if update:
        raise Exception('job {} role {} party {} synchronize task status failed'.format(job_id, role, party_id))
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--job_id', required=True, type=str, help="job id")
    parser.add_argument('-w', '--work_mode', required=True, type=str, help="work mode")
    parser.add_argument('-b', '--backend', required=True, type=str, help="backend")
    args = parser.parse_args()
    job_id = args.job_id
    work_mode = int(args.work_mode)
    backend = int(args.backend)
    session.init(job_id=job_id, mode=work_mode, backend=backend, set_log_dir=False)
    try:
        schedule_logger(job_id.split('_')[0]).info('start to stop session {}'.format(session.get_session_id()))
        session.stop()
        schedule_logger(job_id.split('_')[0]).info('stop session {} success'.format(session.get_session_id()))
    except Exception:
        pass
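# Illustrative invocation of the run() entry point above. This is a sketch: the module filename
# "session_stop.py" is an assumption, and only the -j/-w/-b flags defined by the parser above are
# shown (start_session_stop later in this section also passes a -c flag, which this parser would
# reject unless such an argument is added).
#
#   python3 session_stop.py -j ${task_id}_${role}_${party_id} -w 1 -b 0
#
# The value passed with -j is a composite session id, so job_id.split('_')[0] in the log calls
# recovers the original job id.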
def start_stop(job_id, operate=None):
    schedule_logger(job_id).info('get stop job {} command'.format(job_id))
    jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
    if not jobs:
        jobs = job_utils.query_job(job_id=job_id)
    if jobs:
        job_info = {'job_id': job_id}
        if operate:
            job_info['operate'] = operate
        job_work_mode = jobs[0].f_work_mode
        initiator_party_id = jobs[0].f_initiator_party_id
        response = federated_api(job_id=job_id,
                                 method='POST',
                                 endpoint='/{}/job/stop/do'.format(API_VERSION),
                                 src_party_id=initiator_party_id,
                                 dest_party_id=initiator_party_id,
                                 src_role=None,
                                 json_body=job_info,
                                 work_mode=job_work_mode)
        return response
    else:
        schedule_logger(job_id).info('send stop job {} command failed, cannot find this job'.format(job_id))
        raise Exception('cannot find job: {}'.format(job_id))
def clean_task(self, roles, party_ids):
    schedule_logger(self.job_id).info('clean task {} on {} {}'.format(self.task_id, self.role, self.party_id))
    try:
        for role in roles.split(','):
            for party_id in party_ids.split(','):
                # clean up temporary tables
                namespace_clean = job_utils.generate_session_id(task_id=self.task_id, role=role, party_id=party_id)
                session.clean_tables(namespace=namespace_clean, regex_string='*')
                schedule_logger(self.job_id).info('clean table by namespace {} on {} {} done'.format(
                    namespace_clean, self.role, self.party_id))
                # clean up the last tables of the federation
                namespace_clean = self.task_id
                session.clean_tables(namespace=namespace_clean, regex_string='*')
                schedule_logger(self.job_id).info('clean table by namespace {} on {} {} done'.format(
                    namespace_clean, self.role, self.party_id))
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
    schedule_logger(self.job_id).info('clean task {} on {} {} done'.format(self.task_id, self.role, self.party_id))
def insert_data_to_db(self, metric_namespace: str, metric_name: str, data_type: int, kv, job_level=False):
    with DB.connection_context():
        try:
            tracking_metric = TrackingMetric.model(table_index=self.job_id)
            tracking_metric.f_job_id = self.job_id
            tracking_metric.f_component_name = self.component_name if not job_level else 'dag'
            tracking_metric.f_task_id = self.task_id
            tracking_metric.f_role = self.role
            tracking_metric.f_party_id = self.party_id
            tracking_metric.f_metric_namespace = metric_namespace
            tracking_metric.f_metric_name = metric_name
            tracking_metric.f_type = data_type
            default_db_source = tracking_metric.to_json()
            tracking_metric_data_source = []
            for k, v in kv:
                db_source = default_db_source.copy()
                db_source['f_key'] = serialize_b64(k)
                db_source['f_value'] = serialize_b64(v)
                db_source['f_create_time'] = current_timestamp()
                tracking_metric_data_source.append(db_source)
            self.bulk_insert_model_data(TrackingMetric.model(table_index=self.get_table_index()),
                                        tracking_metric_data_source)
        except Exception as e:
            schedule_logger(self.job_id).exception(e)
def check_dependencies(job_id, dag, component):
    role, party_id = job_utils.query_job_info(job_id)
    dependencies = dag.get_dependency(role=role, party_id=int(party_id)).get('dependencies', {})
    if not dependencies:
        return False
    dependent_component_names = dependencies.get(component.get_name(), [])
    schedule_logger(job_id).info('job {} component {} all dependent component: {}'.format(
        job_id, component.get_name(), dependent_component_names))
    for dependent_component in dependent_component_names:
        dependent_component_name = dependent_component["component_name"]
        dependent_component = dag.get_component_info(dependent_component_name)
        dependent_component_task_status = TaskScheduler.check_task_status(job_id, dependent_component)
        schedule_logger(job_id).info('job {} component {} dependency {} status is {}'.format(
            job_id, component.get_name(), dependent_component_name, dependent_component_task_status))
        if not dependent_component_task_status:
            # dependency component run failed, break
            return False
    else:
        # for-else: only reached when no dependency failed
        return True
def kill_task_executor_process(task: Task, only_child=False):
    try:
        pid = int(task.f_run_pid)
        if not pid:
            return False
        schedule_logger(task.f_job_id).info("try to stop job {} task {} {} {} process pid:{}".format(
            task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, pid))
        if not check_job_process(pid):
            return True
        p = psutil.Process(int(pid))
        if not is_task_executor_process(task=task, process=p):
            schedule_logger(task.f_job_id).warning(
                "this pid is not task executor: {}".format(" ".join(p.cmdline())))
            return False
        for child in p.children(recursive=True):
            if check_job_process(child.pid) and is_task_executor_process(task=task, process=child):
                child.kill()
        if not only_child:
            if check_job_process(p.pid) and is_task_executor_process(task=task, process=p):
                p.kill()
        return True
    except Exception as e:
        raise e
def check_task_status(job_id, component, interval=0.5):
    task_id = job_utils.generate_task_id(job_id=job_id, component_name=component.get_name())
    while True:
        try:
            status_collect = set()
            parameters = component.get_role_parameters()
            for _role, _partys_parameters in parameters.items():
                for _party_parameters in _partys_parameters:
                    _party_id = _party_parameters.get('local', {}).get('party_id')
                    tasks = query_task(job_id=job_id, task_id=task_id, role=_role, party_id=_party_id)
                    if tasks:
                        task_status = tasks[0].f_status
                    else:
                        task_status = 'notRunning'
                    schedule_logger(job_id).info('job {} component {} run on {} {} status is {}'.format(
                        job_id, component.get_name(), _role, _party_id, task_status))
                    status_collect.add(task_status)
            if 'failed' in status_collect:
                return False
            if 'timeout' in status_collect:
                return None
            elif len(status_collect) == 1 and TaskStatus.COMPLETE in status_collect:
                return True
            else:
                time.sleep(interval)
        except Exception as e:
            schedule_logger(job_id).exception(e)
            return False
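# Hypothetical caller sketch (not taken from the source): check_task_status is tri-state -- it
# returns True when every party reports COMPLETE, None when any party reports 'timeout', and
# False on 'failed' or on an unexpected exception -- so scheduling code might branch like this:
#
#   dependencies_ok = TaskScheduler.check_task_status(job_id, component)
#   if dependencies_ok is None:
#       TaskScheduler.stop(job_id=job_id, end_status=JobStatus.TIMEOUT)   # treat as timeout
#   elif dependencies_ok:
#       pass                                                              # continue scheduling downstream components
#   else:
#       TaskScheduler.stop(job_id=job_id, end_status=JobStatus.FAILED)    # a dependency failed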
def cancel_job(job_id, role, party_id, job_initiator):
    schedule_logger(job_id).info('{} {} get cancel waiting job {} command'.format(role, party_id, job_id))
    jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
    if jobs:
        job = jobs[0]
        job_runtime_conf = json_loads(job.f_runtime_conf)
        event = job_utils.job_event(job.f_job_id,
                                    job_runtime_conf['initiator']['role'],
                                    job_runtime_conf['initiator']['party_id'])
        try:
            RuntimeConfig.JOB_QUEUE.del_event(event)
        except:
            return False
        schedule_logger(job_id).info('cancel waiting job successfully, job id is {}'.format(job.f_job_id))
        return True
    else:
        jobs = job_utils.query_job(job_id=job_id)
        if jobs:
            raise Exception('role {} party id {} cancel waiting job {} failed: not the initiator'.format(
                role, party_id, job_id))
        raise Exception('role {} party id {} cancel waiting job failed: cannot find job {}'.format(
            role, party_id, job_id))
def save_metric_data(self, metric_namespace: str, metric_name: str, metrics: List[Metric], job_level=False):
    schedule_logger(self.job_id).info('save job {} component {} on {} {} {} {} metric data'.format(
        self.job_id, self.component_name, self.role, self.party_id, metric_namespace, metric_name))
    kv = []
    for metric in metrics:
        kv.append((metric.key, metric.value))
    self.insert_data_to_db(metric_namespace, metric_name, 1, kv, job_level)
def save_job_info(self, role, party_id, job_info, create=False):
    with DB.connection_context():
        schedule_logger(self.job_id).info('save {} {} job: {}'.format(role, party_id, job_info))
        jobs = Job.select().where(Job.f_job_id == self.job_id, Job.f_role == role, Job.f_party_id == party_id)
        is_insert = True
        if jobs:
            job = jobs[0]
            is_insert = False
            if job.f_status == JobStatus.TIMEOUT:
                return None
        elif create:
            job = Job()
            job.f_create_time = current_timestamp()
        else:
            return None
        job.f_job_id = self.job_id
        job.f_role = role
        job.f_party_id = party_id
        if 'f_status' in job_info:
            if job.f_status in [JobStatus.COMPLETE, JobStatus.FAILED]:
                # Termination status cannot be updated
                # TODO:
                return
            if (job_info['f_status'] in [JobStatus.FAILED, JobStatus.TIMEOUT]) and (not job.f_end_time):
                if not job.f_start_time:
                    return
                job_info['f_end_time'] = current_timestamp()
                job_info['f_elapsed'] = job_info['f_end_time'] - job.f_start_time
                job_info['f_update_time'] = current_timestamp()
            if (job_info['f_status'] in [JobStatus.FAILED, JobStatus.TIMEOUT,
                                         JobStatus.CANCELED, JobStatus.COMPLETE]):
                job_info['f_tag'] = 'job_end'
        update_fields = []
        for k, v in job_info.items():
            try:
                if k in ['f_job_id', 'f_role', 'f_party_id'] or v == getattr(Job, k).default:
                    continue
                setattr(job, k, v)
                update_fields.append(getattr(Job, k))
            except:
                pass
        if is_insert:
            job.save(force_insert=True)
        else:
            job.save(only=update_fields)
def save_metric_meta(self, metric_namespace: str, metric_name: str, metric_meta: MetricMeta,
                     job_level: bool = False):
    schedule_logger(self.job_id).info('save job {} component {} on {} {} {} {} metric meta'.format(
        self.job_id, self.component_name, self.role, self.party_id, metric_namespace, metric_name))
    self.insert_data_to_db(metric_namespace, metric_name, 0, metric_meta.to_dict().items(), job_level)
def bulk_insert_model_data(self, model, data_source):
    with DB.connection_context():
        try:
            DB.create_tables([model])
            batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
            for i in range(0, len(data_source), batch_size):
                with DB.atomic():
                    model.insert_many(data_source[i:i + batch_size]).execute()
            return len(data_source)
        except Exception as e:
            schedule_logger(self.job_id).exception(e)
            return 0
def update_task_status(job_id, component_name, task_id, role, party_id, task_info):
    tracker = Tracking(job_id=job_id, role=role, party_id=party_id, component_name=component_name, task_id=task_id)
    tracker.save_task(role=role, party_id=party_id, task_info=task_info)
    schedule_logger(job_id).info('job {} component {} {} {} status {}'.format(
        job_id, component_name, role, party_id, task_info.get('f_status', '')))
def run_do(self):
    try:
        running_tasks = job_utils.query_task(status='running', run_ip=get_lan_ip())
        stop_job_ids = set()
        # detect_logger.info('start to detect running job..')
        for task in running_tasks:
            try:
                process_exist = job_utils.check_job_process(int(task.f_run_pid))
                if not process_exist:
                    detect_logger.info('job {} component {} on {} {} task {} {} process does not exist'.format(
                        task.f_job_id, task.f_component_name, task.f_role, task.f_party_id,
                        task.f_task_id, task.f_run_pid))
                    stop_job_ids.add(task.f_job_id)
            except Exception as e:
                detect_logger.exception(e)
        if stop_job_ids:
            schedule_logger().info('start to stop jobs: {}'.format(stop_job_ids))
        for job_id in stop_job_ids:
            jobs = job_utils.query_job(job_id=job_id)
            if jobs:
                initiator_party_id = jobs[0].f_initiator_party_id
                job_work_mode = jobs[0].f_work_mode
                if len(jobs) > 1:
                    # i am initiator
                    my_party_id = initiator_party_id
                else:
                    my_party_id = jobs[0].f_party_id
                    initiator_party_id = jobs[0].f_initiator_party_id
                api_utils.federated_api(job_id=job_id,
                                        method='POST',
                                        endpoint='/{}/job/stop'.format(API_VERSION),
                                        src_party_id=my_party_id,
                                        dest_party_id=initiator_party_id,
                                        src_role=None,
                                        json_body={'job_id': job_id, 'operate': 'kill'},
                                        work_mode=job_work_mode)
                TaskScheduler.finish_job(job_id=job_id,
                                         job_runtime_conf=json_loads(jobs[0].f_runtime_conf),
                                         stop=True)
    except Exception as e:
        detect_logger.exception(e)
    finally:
        detect_logger.info('finish detect running job')
def save_output_data_table(self, data_table: Table, data_name: str = 'component'):
    """
    Save component output data, will run in the task executor process
    :param data_table:
    :param data_name:
    :return:
    """
    if data_table:
        persistent_table_namespace = 'output_data_{}'.format(self.task_id)
        persistent_table_name = data_table.get_name()
        schedule_logger(self.job_id).info('persisting the component output temporary table: {} {} to {} {}'.format(
            data_table.get_namespace(), data_table.get_name(),
            persistent_table_namespace, persistent_table_name))
        persistent_table = data_table.save_as(namespace=persistent_table_namespace, name=persistent_table_name)
        persistent_table_metas = {}
        persistent_table_metas.update(data_table.get_metas())
        persistent_table_metas["schema"] = data_table.schema
        session.save_data_table_meta(persistent_table_metas,
                                     data_table_namespace=persistent_table.get_namespace(),
                                     data_table_name=persistent_table.get_name())
        data_table_info = {
            data_name: {'name': persistent_table.get_name(), 'namespace': persistent_table.get_namespace()}
        }
    else:
        data_table_info = {}
    session.save_data(data_table_info.items(),
                      name=Tracking.output_table_name('data'),
                      namespace=self.table_namespace,
                      partition=48)
    self.save_data_view(self.role, self.party_id,
                        data_info={'f_table_name': persistent_table._name if data_table else '',
                                   'f_table_namespace': persistent_table._namespace if data_table else '',
                                   'f_partition': persistent_table._partitions if data_table else None,
                                   'f_table_count_actual': data_table.count() if data_table else 0},
                        mark=True)
def clean_job(job_id, role, party_id, roles, party_ids):
    schedule_logger(job_id).info('job {} on {} {} start to clean'.format(job_id, role, party_id))
    tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
    for task in tasks:
        try:
            Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task(roles, party_ids)
            schedule_logger(job_id).info('job {} component {} on {} {} clean done'.format(
                job_id, task.f_component_name, role, party_id))
        except Exception as e:
            schedule_logger(job_id).info('job {} component {} on {} {} clean failed'.format(
                job_id, task.f_component_name, role, party_id))
            schedule_logger(job_id).exception(e)
    schedule_logger(job_id).info('job {} on {} {} clean done'.format(job_id, role, party_id))
def kill_job(job_id, role, party_id, job_initiator, timeout=False):
    schedule_logger(job_id).info('{} {} get kill job {} command'.format(role, party_id, job_id))
    tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
    for task in tasks:
        kill_status = False
        try:
            kill_status = job_utils.kill_process(int(task.f_run_pid))
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info('job {} component {} on {} {} process {} kill {}'.format(
                job_id, task.f_component_name, task.f_role, task.f_party_id, task.f_run_pid,
                'success' if kill_status else 'failed'))
        status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT
        if task.f_status != TaskStatus.SUCCESS:
            task.f_status = status
        try:
            TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name,
                                          task_id=task.f_task_id, role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          task_info=task.to_json(),
                                          initiator_role=job_initiator.get('role', None))
        except Exception as e:
            schedule_logger(job_id).exception(e)
def kill_job(job_id, role, party_id, job_initiator, timeout=False, component_name=''):
    schedule_logger(job_id).info('{} {} get kill job {} {} command'.format(role, party_id, job_id, component_name))
    task_info = job_utils.get_task_info(job_id, role, party_id, component_name)
    tasks = job_utils.query_task(**task_info)
    job = job_utils.query_job(job_id=job_id)
    for task in tasks:
        kill_status = False
        try:
            # task clean up
            runtime_conf = json_loads(job[0].f_runtime_conf)
            roles = ','.join(runtime_conf['role'].keys())
            party_ids = ','.join([','.join([str(j) for j in i]) for i in runtime_conf['role'].values()])
            # Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task(roles, party_ids)
            # stop task
            kill_status = job_utils.kill_task_executor_process(task)
            # session stop
            job_utils.start_session_stop(task)
        except Exception as e:
            schedule_logger(job_id).exception(e)
        finally:
            schedule_logger(job_id).info('job {} component {} on {} {} process {} kill {}'.format(
                job_id, task.f_component_name, task.f_role, task.f_party_id, task.f_run_pid,
                'success' if kill_status else 'failed'))
        status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT
        if task.f_status != TaskStatus.COMPLETE:
            task.f_status = status
        try:
            TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name,
                                          task_id=task.f_task_id, role=role, party_id=party_id,
                                          initiator_party_id=job_initiator.get('party_id', None),
                                          task_info=task.to_json(),
                                          initiator_role=job_initiator.get('role', None))
        except Exception as e:
            schedule_logger(job_id).exception(e)
def clean_queue():
    schedule_logger().info('get clean queue command')
    jobs = job_utils.query_job(is_initiator=1, status=JobStatus.WAITING)
    if jobs:
        for job in jobs:
            schedule_logger(job.f_job_id).info('start to send {} job {} command'.format(
                JobStatus.CANCELED, job.f_job_id))
            job_info = {'f_job_id': job.f_job_id, 'f_status': JobStatus.CANCELED}
            roles = json_loads(job.f_roles)
            job_work_mode = job.f_work_mode
            initiator_party_id = job.f_party_id
            TaskScheduler.sync_job_status(job_id=job.f_job_id, roles=roles,
                                          initiator_party_id=initiator_party_id,
                                          initiator_role=job.f_role,
                                          work_mode=job_work_mode,
                                          job_info=job_info)
            job_runtime_conf = json_loads(job.f_runtime_conf)
            event = job_utils.job_event(job.f_job_id,
                                        job_runtime_conf['initiator']['role'],
                                        job_runtime_conf['initiator']['party_id'])
            try:
                RuntimeConfig.JOB_QUEUE.del_event(event)
                schedule_logger(job.f_job_id).info('send {} job {} command success'.format(
                    JobStatus.CANCELED, job.f_job_id))
            except Exception as e:
                schedule_logger(job.f_job_id).error(e)
    else:
        raise Exception('There are no jobs in the queue')
def read_data_from_db(self, metric_namespace: str, metric_name: str, data_type, job_level=False):
    with DB.connection_context():
        metrics = []
        try:
            query_sql = 'select f_key, f_value from t_tracking_metric_{} where ' \
                        'f_job_id = "{}" and f_component_name = "{}" and f_role = "{}" and f_party_id = "{}" ' \
                        'and f_task_id = "{}" and f_metric_namespace = "{}" and f_metric_name = "{}" ' \
                        'and f_type = "{}" order by f_id'.format(
                            self.get_table_index(), self.job_id,
                            self.component_name if not job_level else 'dag',
                            self.role, self.party_id, self.task_id,
                            metric_namespace, metric_name, data_type)
            cursor = DB.execute_sql(query_sql)
            for row in cursor.fetchall():
                yield deserialize_b64(row[0]), deserialize_b64(row[1])
        except Exception as e:
            schedule_logger(self.job_id).exception(e)
        return metrics
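# Usage sketch (an assumption about how callers consume this generator; data_type=1 matches the
# value save_metric_data writes, data_type=0 the one save_metric_meta writes):
#
#   tracker = Tracking(job_id=job_id, role=role, party_id=party_id,
#                      component_name=component_name, task_id=task_id)
#   for metric_key, metric_value in tracker.read_data_from_db(metric_namespace, metric_name, data_type=1):
#       ...  # keys and values arrive already decoded by deserialize_b64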
def stop(job_id, end_status=JobStatus.FAILED, component_name=''):
    schedule_logger(job_id).info('get {} job {} {} command'.format(
        "cancel" if end_status == JobStatus.CANCELED else "stop", job_id, component_name))
    jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
    cancel_success = False
    is_cancel = (end_status == JobStatus.CANCELED)
    if jobs:
        initiator_job = jobs[0]
        job_info = {'f_job_id': job_id, 'f_status': end_status}
        roles = json_loads(initiator_job.f_roles)
        job_work_mode = initiator_job.f_work_mode
        initiator_party_id = initiator_job.f_party_id
        # set status first
        if not component_name:
            TaskScheduler.sync_job_status(job_id=job_id, roles=roles,
                                          initiator_party_id=initiator_party_id,
                                          initiator_role=initiator_job.f_role,
                                          work_mode=job_work_mode,
                                          job_info=job_info)
        for role, partys in roles.items():
            for party_id in partys:
                response = federated_api(job_id=job_id,
                                         method='POST',
                                         endpoint='/{}/schedule/{}/{}/{}/{}'.format(
                                             API_VERSION, job_id, role, party_id,
                                             "cancel" if is_cancel else "kill"),
                                         src_party_id=initiator_party_id,
                                         dest_party_id=party_id,
                                         src_role=initiator_job.f_role,
                                         json_body={'job_initiator': {'party_id': initiator_job.f_party_id,
                                                                      'role': initiator_job.f_role},
                                                    'timeout': end_status == JobStatus.TIMEOUT,
                                                    'component_name': component_name},
                                         work_mode=job_work_mode)
                if response['retcode'] == 0:
                    cancel_success = True
                    schedule_logger(job_id).info('send {} {} {} job {} {} command successfully'.format(
                        role, party_id, "cancel" if is_cancel else "kill", job_id, component_name))
                    if is_cancel:
                        break
                else:
                    schedule_logger(job_id).info('send {} {} {} job {} {} command failed: {}'.format(
                        role, party_id, "cancel" if is_cancel else "kill", job_id, component_name,
                        response['retmsg']))
        if is_cancel:
            return cancel_success
    else:
        jobs = job_utils.query_job(job_id=job_id)
        if jobs:
            raise Exception('Current role is not this job initiator')
        schedule_logger(job_id).info('send {} job {} {} command failed'.format(
            "cancel" if is_cancel else "kill", job_id, component_name))
        raise Exception('cannot find job: {}'.format(job_id))
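# With API_VERSION assumed to be "v1" (an assumption; the constant is imported from settings
# elsewhere), the endpoint built above expands to, for example:
#
#   POST /v1/schedule/${job_id}/guest/9999/kill      # end_status other than CANCELED
#   POST /v1/schedule/${job_id}/guest/9999/cancel    # end_status == CANCELED
#
# where "guest"/"9999" are illustrative role/party_id values, and the JSON body carries
# job_initiator, the timeout flag and an optional component_name.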
def finish_job(job_id, job_runtime_conf, stop=False):
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    model_id_base64 = base64_encode(job_parameters['model_id'])
    model_version_base64 = base64_encode(job_parameters['model_version'])
    roles = ','.join(job_runtime_conf['role'].keys())
    party_ids = ','.join([','.join([str(j) for j in i]) for i in job_runtime_conf['role'].values()])
    for role, partys in job_runtime_conf['role'].items():
        for party_id in partys:
            # save pipeline
            if not stop:
                federated_api(job_id=job_id,
                              method='POST',
                              endpoint='/{}/schedule/{}/{}/{}/{}/{}/save/pipeline'.format(
                                  API_VERSION, job_id, role, party_id, model_id_base64, model_version_base64),
                              src_party_id=job_initiator['party_id'],
                              dest_party_id=party_id,
                              src_role=job_initiator['role'],
                              json_body={},
                              work_mode=job_parameters['work_mode'])
            # clean
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/schedule/{}/{}/{}/{}/{}/clean'.format(
                              API_VERSION, job_id, role, party_id, roles, party_ids),
                          src_party_id=job_initiator['party_id'],
                          dest_party_id=party_id,
                          src_role=job_initiator['role'],
                          json_body={},
                          work_mode=job_parameters['work_mode'])
    schedule_logger(job_id, delete=True)
def start_session_stop(task):
    job_conf_dict = get_job_conf(task.f_job_id)
    runtime_conf = job_conf_dict['job_runtime_conf_path']
    process_cmd = [
        'python3', sys.modules[session_utils.SessionStop.__module__].__file__,
        '-j', '{}_{}_{}'.format(task.f_task_id, task.f_role, task.f_party_id),
        '-w', str(runtime_conf.get('job_parameters').get('work_mode')),
        '-b', str(runtime_conf.get('job_parameters').get('backend', 0)),
        '-c', 'stop' if task.f_status == TaskStatus.COMPLETE else 'kill'
    ]
    schedule_logger(task.f_job_id).info('start run subprocess to stop component {} session'.format(
        task.f_component_name))
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id), task.f_role, task.f_party_id,
                            task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    p = run_subprocess(config_dir=task_dir, process_cmd=process_cmd, log_dir=None)
def is_task_executor_process(task: Task, process: psutil.Process):
    """
    Check by command line whether the process is a task executor process
    :param task:
    :param process:
    :return:
    """
    # Todo: The same map should be used for the run task command
    run_cmd_map = {
        3: "f_job_id",
        5: "f_component_name",
        7: "f_task_id",
        9: "f_role",
        11: "f_party_id"
    }
    try:
        cmdline = process.cmdline()
    except Exception as e:
        # Not sure whether the process is a task executor process, operations processing is required
        schedule_logger(task.f_job_id).warning(e)
        return False
    for i, k in run_cmd_map.items():
        if len(cmdline) > i and cmdline[i] == getattr(task, k):
            continue
        else:
            # todo: The logging level should be obtained first
            if len(cmdline) > i:
                schedule_logger(task.f_job_id).debug(cmdline[i])
                schedule_logger(task.f_job_id).debug(getattr(task, k))
            return False
    else:
        # for-else: every mapped argument matched the task's fields
        return True
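# The index map above implies a task-executor command line of roughly this shape (inferred from
# run_cmd_map rather than copied from the run-task code; the flag names are assumptions):
#
#   ['python3', '<task_executor script>',
#    '-j', f_job_id,          # index 3
#    '-n', f_component_name,  # index 5
#    '-t', f_task_id,         # index 7
#    '-r', f_role,            # index 9
#    '-p', f_party_id,        # index 11
#    ...]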
def save_metric_meta_remote(self, metric_namespace: str, metric_name: str, metric_meta: MetricMeta,
                            job_level: bool = False):
    # TODO: In the next version will be moved to tracking api module on arch/api package
    schedule_logger(self.job_id).info('request save job {} component {} on {} {} {} {} metric meta'.format(
        self.job_id, self.component_name, self.role, self.party_id, metric_namespace, metric_name))
    request_body = dict()
    request_body['metric_namespace'] = metric_namespace
    request_body['metric_name'] = metric_name
    request_body['metric_meta'] = serialize_b64(metric_meta, to_str=True)
    request_body['job_level'] = job_level
    response = api_utils.local_api(method='POST',
                                   endpoint='/{}/tracking/{}/{}/{}/{}/{}/metric_meta/save'.format(
                                       API_VERSION, self.job_id, self.component_name, self.task_id,
                                       self.role, self.party_id),
                                   json_body=request_body)
    return response['retcode'] == 0
def finish_job(job_id, job_runtime_conf):
    job_parameters = job_runtime_conf['job_parameters']
    job_initiator = job_runtime_conf['initiator']
    model_id_base64 = base64_encode(job_parameters['model_id'])
    model_version_base64 = base64_encode(job_parameters['model_version'])
    for role, partys in job_runtime_conf['role'].items():
        for party_id in partys:
            # save pipeline
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/schedule/{}/{}/{}/{}/{}/save/pipeline'.format(
                              API_VERSION, job_id, role, party_id, model_id_base64, model_version_base64),
                          src_party_id=job_initiator['party_id'],
                          dest_party_id=party_id,
                          src_role=job_initiator['role'],
                          json_body={},
                          work_mode=job_parameters['work_mode'])
            # clean
            """
            federated_api(job_id=job_id,
                          method='POST',
                          endpoint='/{}/schedule/{}/{}/{}/clean'.format(
                              API_VERSION, job_id, role, party_id),
                          src_party_id=job_initiator['party_id'],
                          dest_party_id=party_id,
                          src_role=job_initiator['role'],
                          json_body={},
                          work_mode=job_parameters['work_mode'])
            """
    schedule_logger(job_id, delete=True)