Example #1
 def run(self):
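     # on server start-up: after a short delay, stop any initiator jobs still recorded as running and mark them FAILED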
     time.sleep(5)
     jobs = job_utils.query_job(status='running', is_initiator=1)
     job_ids = set([job.f_job_id for job in jobs])
     for job_id in job_ids:
         schedule_logger(job_id).info('fate flow server start clean job')
         TaskScheduler.stop(job_id, JobStatus.FAILED)
Example #2
 def save_pipeline(job_id, role, party_id, model_id, model_version):
     schedule_logger(job_id).info(
         'job {} on {} {} start to save pipeline'.format(
             job_id, role, party_id))
     job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
         job_id=job_id, role=role, party_id=party_id)
     job_parameters = job_runtime_conf.get('job_parameters', {})
     job_type = job_parameters.get('job_type', '')
     if job_type == 'predict':
         return
     dag = job_utils.get_job_dsl_parser(
         dsl=job_dsl,
         runtime_conf=job_runtime_conf,
         train_runtime_conf=train_runtime_conf)
     predict_dsl = dag.get_predict_dsl(role=role)
     pipeline = pipeline_pb2.Pipeline()
     pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
     pipeline.train_dsl = json_dumps(job_dsl, byte=True)
     pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
     pipeline.fate_version = RuntimeConfig.get_env("FATE")
     pipeline.model_id = model_id
     pipeline.model_version = model_version
     job_tracker = Tracking(job_id=job_id,
                            role=role,
                            party_id=party_id,
                            model_id=model_id,
                            model_version=model_version)
     job_tracker.save_pipeline(pipelined_buffer_object=pipeline)
     schedule_logger(job_id).info(
         'job {} on {} {} save pipeline successfully'.format(
             job_id, role, party_id))
Example #3
def mediation_queue_put_events(queue):
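    # drain the mediation queue (status 5): cancel each event's ready state and re-queue it with status 1; cancel the job if re-queueing fails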
    n = queue.qsize(status=5)
    stat_logger.info('start check mediation queue, total num {}'.format(n))
    for i in range(n):
        event = queue.get_event(status=5)
        try:
            TaskScheduler.cancel_ready(event['job_id'],
                                       event['initiator_role'],
                                       event['initiator_party_id'])
            is_failed = queue.put_event(event,
                                        job_id=event['job_id'],
                                        status=1)
            schedule_logger(event['job_id']).info(
                'job into queue_1 status is {}'.format(
                    'success' if not is_failed else 'failed'))
            if is_failed:
                schedule_logger(event['job_id']).info('start to cancel job')
                TaskScheduler.stop(job_id=event['job_id'],
                                   end_status=JobStatus.CANCELED)
        except Exception as e:
            schedule_logger(event['job_id']).error(e)
            try:
                schedule_logger(event['job_id']).info('start cancel job')
                TaskScheduler.stop(job_id=event['job_id'],
                                   end_status=JobStatus.CANCELED)
            except:
                schedule_logger(event['job_id']).info('cancel job failed')
Example #4
 def sync_task_status(job_id, component_name, task_id, role, party_id, initiator_party_id, initiator_role, task_info, update=False):
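     # report the task status to this party and to the initiator via federated_api; the executor's run IP is stripped before sending to the initiator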
     sync_success = True
     for dest_party_id in {party_id, initiator_party_id}:
         if party_id != initiator_party_id and dest_party_id == initiator_party_id:
             # do not pass the process id to the initiator
             task_info['f_run_ip'] = ''
         response = federated_api(job_id=job_id,
                                  method='POST',
                                  endpoint='/{}/schedule/{}/{}/{}/{}/{}/status'.format(
                                      API_VERSION,
                                      job_id,
                                      component_name,
                                      task_id,
                                      role,
                                      party_id),
                                  src_party_id=party_id,
                                  dest_party_id=dest_party_id,
                                  src_role=role,
                                  json_body=task_info,
                                  work_mode=RuntimeConfig.WORK_MODE)
         if response['retcode']:
             sync_success = False
             schedule_logger().exception('job {} role {} party {} synchronize task status failed'.format(job_id, role, party_id))
             break
     if not sync_success and not update:
         task_info['f_status'] = TaskStatus.FAILED
         TaskExecutor.sync_task_status(job_id, component_name, task_id, role, party_id, initiator_party_id,
                                       initiator_role, task_info, update=True)
     if update:
         raise Exception('job {} role {} party {} synchronize task status failed'.format(job_id, role, party_id))
Example #5
 def run():
     parser = argparse.ArgumentParser()
     parser.add_argument('-j',
                         '--job_id',
                         required=True,
                         type=str,
                         help="job id")
     parser.add_argument('-w',
                         '--work_mode',
                         required=True,
                         type=str,
                         help="work mode")
     parser.add_argument('-b',
                         '--backend',
                         required=True,
                         type=str,
                         help="backend")
     args = parser.parse_args()
     job_id = args.job_id
     work_mode = int(args.work_mode)
     backend = int(args.backend)
     session.init(job_id=job_id,
                  mode=work_mode,
                  backend=backend,
                  set_log_dir=False)
     try:
         schedule_logger(job_id.split('_')[0]).info(
             'start stop session {}'.format(session.get_session_id()))
         session.stop()
         schedule_logger(job_id.split('_')[0]).info(
             'stop session {} success'.format(session.get_session_id()))
     except Exception as e:
         pass
Example #6
 def start_stop(job_id, operate=None):
     schedule_logger(job_id).info('get {} job {} command'.format(
         'stop', job_id))
     jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
     if not jobs:
         jobs = job_utils.query_job(job_id=job_id)
     if jobs:
         job_info = {'job_id': job_id}
         if operate:
             job_info['operate'] = operate
         job_work_mode = jobs[0].f_work_mode
         initiator_party_id = jobs[0].f_initiator_party_id
         response = federated_api(
             job_id=job_id,
             method='POST',
             endpoint='/{}/job/stop/do'.format(API_VERSION),
             src_party_id=initiator_party_id,
             dest_party_id=initiator_party_id,
             src_role=None,
             json_body=job_info,
             work_mode=job_work_mode)
         return response
     else:
         schedule_logger(job_id).info(
             'send {} job stop command failed, no find this job'.format(
                 job_id))
         raise Exception('can not found job: {}'.format(job_id))
Example #7
 def clean_task(self, roles, party_ids):
     schedule_logger(self.job_id).info('clean task {} on {} {}'.format(self.task_id,
                                                                       self.role,
                                                                       self.party_id))
     try:
         for role in roles.split(','):
             for party_id in party_ids.split(','):
                 # clean up temporary tables
                 namespace_clean = job_utils.generate_session_id(task_id=self.task_id,
                                                                 role=role,
                                                                 party_id=party_id)
                 session.clean_tables(namespace=namespace_clean, regex_string='*')
                 schedule_logger(self.job_id).info('clean table by namespace {} on {} {} done'.format(namespace_clean,
                                                                                                      self.role,
                                                                                                      self.party_id))
                 # clean up the last tables of the federation
                 namespace_clean = self.task_id
                 session.clean_tables(namespace=namespace_clean, regex_string='*')
                 schedule_logger(self.job_id).info('clean table by namespace {} on {} {} done'.format(namespace_clean,
                                                                                                      self.role,
                                                                                                      self.party_id))
                 
     except Exception as e:
         schedule_logger(self.job_id).exception(e)
     schedule_logger(self.job_id).info('clean task {} on {} {} done'.format(self.task_id,
                                                                            self.role,
                                                                            self.party_id))
Example #8
 def insert_data_to_db(self,
                       metric_namespace: str,
                       metric_name: str,
                       data_type: int,
                       kv,
                       job_level=False):
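     # serialize each (key, value) pair with base64 and bulk-insert the resulting rows into the TrackingMetric table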
     with DB.connection_context():
         try:
             tracking_metric = TrackingMetric.model(table_index=self.job_id)
             tracking_metric.f_job_id = self.job_id
             tracking_metric.f_component_name = self.component_name if not job_level else 'dag'
             tracking_metric.f_task_id = self.task_id
             tracking_metric.f_role = self.role
             tracking_metric.f_party_id = self.party_id
             tracking_metric.f_metric_namespace = metric_namespace
             tracking_metric.f_metric_name = metric_name
             tracking_metric.f_type = data_type
             default_db_source = tracking_metric.to_json()
             tracking_metric_data_source = []
             for k, v in kv:
                 db_source = default_db_source.copy()
                 db_source['f_key'] = serialize_b64(k)
                 db_source['f_value'] = serialize_b64(v)
                 db_source['f_create_time'] = current_timestamp()
                 tracking_metric_data_source.append(db_source)
             self.bulk_insert_model_data(
                 TrackingMetric.model(table_index=self.get_table_index()),
                 tracking_metric_data_source)
         except Exception as e:
             schedule_logger(self.job_id).exception(e)
Example #9
 def check_dependencies(job_id, dag, component):
     role, party_id = job_utils.query_job_info(job_id)
     dependencies = dag.get_dependency(role=role,
                                       party_id=int(party_id)).get(
                                           'dependencies', {})
     if not dependencies:
         return False
     dependent_component_names = dependencies.get(component.get_name(), [])
     schedule_logger(job_id).info(
         'job {} component {} all dependent component: {}'.format(
             job_id, component.get_name(), dependent_component_names))
     for dependent_component in dependent_component_names:
         dependent_component_name = dependent_component["component_name"]
         dependent_component = dag.get_component_info(
             dependent_component_name)
         dependent_component_task_status = TaskScheduler.check_task_status(
             job_id, dependent_component)
         schedule_logger(job_id).info(
             'job {} component {} dependency {} status is {}'.format(
                 job_id, component.get_name(), dependent_component_name,
                 dependent_component_task_status))
         if not dependent_component_task_status:
             # dependency component run failed, break
             return False
     else:
         return True
Example #10
def kill_task_executor_process(task: Task, only_child=False):
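    # verify that task.f_run_pid really belongs to this task's executor, then kill its children and (unless only_child) the process itself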
    try:
        pid = int(task.f_run_pid)
        if not pid:
            return False
        schedule_logger(task.f_job_id).info(
            "try to stop job {} task {} {} {} process pid:{}".format(
                task.f_job_id, task.f_task_id, task.f_role, task.f_party_id,
                pid))
        if not check_job_process(pid):
            return True
        p = psutil.Process(int(pid))
        if not is_task_executor_process(task=task, process=p):
            schedule_logger(task.f_job_id).warning(
                "this pid is not task executor: {}".format(" ".join(
                    p.cmdline())))
            return False
        for child in p.children(recursive=True):
            if check_job_process(child.pid) and is_task_executor_process(
                    task=task, process=child):
                child.kill()
        if not only_child:
            if check_job_process(p.pid) and is_task_executor_process(
                    task=task, process=p):
                p.kill()
        return True
    except Exception as e:
        raise e
Example #11
 def check_task_status(job_id, component, interval=0.5):
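     # poll every party listed in the component's role parameters: False on any failure, None on timeout, True once all parties report COMPLETE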
     task_id = job_utils.generate_task_id(job_id=job_id, component_name=component.get_name())
     while True:
         try:
             status_collect = set()
             parameters = component.get_role_parameters()
             for _role, _partys_parameters in parameters.items():
                 for _party_parameters in _partys_parameters:
                     _party_id = _party_parameters.get('local', {}).get('party_id')
                     tasks = query_task(job_id=job_id, task_id=task_id, role=_role, party_id=_party_id)
                     if tasks:
                         task_status = tasks[0].f_status
                     else:
                         task_status = 'notRunning'
                     schedule_logger(job_id).info(
                         'job {} component {} run on {} {} status is {}'.format(job_id, component.get_name(), _role,
                                                                                _party_id, task_status))
                     status_collect.add(task_status)
             if 'failed' in status_collect:
                 return False
             if 'timeout' in status_collect:
                 return None
             elif len(status_collect) == 1 and TaskStatus.COMPLETE in status_collect:
                 return True
             else:
                 time.sleep(interval)
         except Exception as e:
             schedule_logger(job_id).exception(e)
             return False
Example #12
 def cancel_job(job_id, role, party_id, job_initiator):
     schedule_logger(job_id).info(
         '{} {} get cancel waiting job {} command'.format(
             role, party_id, job_id))
     jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
     if jobs:
         job = jobs[0]
         job_runtime_conf = json_loads(job.f_runtime_conf)
         event = job_utils.job_event(
             job.f_job_id, job_runtime_conf['initiator']['role'],
             job_runtime_conf['initiator']['party_id'])
         try:
             RuntimeConfig.JOB_QUEUE.del_event(event)
         except:
             return False
         schedule_logger(job_id).info(
             'cancel waiting job successfully, job id is {}'.format(
                 job.f_job_id))
         return True
     else:
         jobs = job_utils.query_job(job_id=job_id)
         if jobs:
             raise Exception(
                 'role {} party id {} cancel waiting job {} failed, not is initiator'
                 .format(role, party_id, job_id))
         raise Exception(
             'role {} party id {} cancel waiting job failed, no find jod {}'
             .format(role, party_id, job_id))
Example #13
 def save_metric_data(self, metric_namespace: str, metric_name: str, metrics: List[Metric], job_level=False):
     schedule_logger(self.job_id).info(
         'save job {} component {} on {} {} {} {} metric data'.format(self.job_id, self.component_name, self.role,
                                                                      self.party_id, metric_namespace, metric_name))
     kv = []
     for metric in metrics:
         kv.append((metric.key, metric.value))
     self.insert_data_to_db(metric_namespace, metric_name, 1, kv, job_level)
Example #14
    def save_job_info(self, role, party_id, job_info, create=False):
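        # insert a new Job record (create=True) or update the existing one; f_status never leaves a terminal state, and reaching one tags the row with 'job_end'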
        with DB.connection_context():
            schedule_logger(self.job_id).info('save {} {} job: {}'.format(
                role, party_id, job_info))
            jobs = Job.select().where(Job.f_job_id == self.job_id,
                                      Job.f_role == role,
                                      Job.f_party_id == party_id)
            is_insert = True
            if jobs:
                job = jobs[0]
                is_insert = False
                if job.f_status == JobStatus.TIMEOUT:
                    return None
            elif create:
                job = Job()
                job.f_create_time = current_timestamp()
            else:
                return None
            job.f_job_id = self.job_id
            job.f_role = role
            job.f_party_id = party_id
            if 'f_status' in job_info:
                if job.f_status in [JobStatus.COMPLETE, JobStatus.FAILED]:
                    # Termination status cannot be updated
                    # TODO:
                    return
                if (job_info['f_status'] in [
                        JobStatus.FAILED, JobStatus.TIMEOUT
                ]) and (not job.f_end_time):
                    if not job.f_start_time:
                        return
                    job_info['f_end_time'] = current_timestamp()
                    job_info['f_elapsed'] = job_info[
                        'f_end_time'] - job.f_start_time
                    job_info['f_update_time'] = current_timestamp()

                if (job_info['f_status'] in [
                        JobStatus.FAILED, JobStatus.TIMEOUT,
                        JobStatus.CANCELED, JobStatus.COMPLETE
                ]):
                    job_info['f_tag'] = 'job_end'
            update_fields = []
            for k, v in job_info.items():
                try:
                    if k in ['f_job_id', 'f_role', 'f_party_id'
                             ] or v == getattr(Job, k).default:
                        continue
                    setattr(job, k, v)
                    update_fields.append(getattr(Job, k))
                except:
                    pass

            if is_insert:
                job.save(force_insert=True)
            else:
                job.save(only=update_fields)
Example #15
 def save_metric_meta(self,
                      metric_namespace: str,
                      metric_name: str,
                      metric_meta: MetricMeta,
                      job_level: bool = False):
     schedule_logger(self.job_id).info(
         'save job {} component {} on {} {} {} {} metric meta'.format(
             self.job_id, self.component_name, self.role, self.party_id,
             metric_namespace, metric_name))
     self.insert_data_to_db(metric_namespace, metric_name, 0,
                            metric_meta.to_dict().items(), job_level)
Example #16
 def bulk_insert_model_data(self, model, data_source):
     with DB.connection_context():
         try:
             DB.create_tables([model])
             batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
             for i in range(0, len(data_source), batch_size):
                 with DB.atomic():
                     model.insert_many(data_source[i:i+batch_size]).execute()
             return len(data_source)
         except Exception as e:
             schedule_logger(self.job_id).exception(e)
             return 0
Example #17
 def update_task_status(job_id, component_name, task_id, role, party_id,
                        task_info):
     tracker = Tracking(job_id=job_id,
                        role=role,
                        party_id=party_id,
                        component_name=component_name,
                        task_id=task_id)
     tracker.save_task(role=role, party_id=party_id, task_info=task_info)
     schedule_logger(job_id).info(
         'job {} component {} {} {} status {}'.format(
             job_id, component_name, role, party_id,
             task_info.get('f_status', '')))
Example #18
 def run_do(self):
     try:
         running_tasks = job_utils.query_task(status='running',
                                              run_ip=get_lan_ip())
         stop_job_ids = set()
         # detect_logger.info('start to detect running job..')
         for task in running_tasks:
             try:
                 process_exist = job_utils.check_job_process(
                     int(task.f_run_pid))
                 if not process_exist:
                     detect_logger.info(
                         'job {} component {} on {} {} task {} {} process does not exist'
                         .format(task.f_job_id, task.f_component_name,
                                 task.f_role, task.f_party_id,
                                 task.f_task_id, task.f_run_pid))
                     stop_job_ids.add(task.f_job_id)
             except Exception as e:
                 detect_logger.exception(e)
         if stop_job_ids:
             schedule_logger().info(
                 'start to stop jobs: {}'.format(stop_job_ids))
         for job_id in stop_job_ids:
             jobs = job_utils.query_job(job_id=job_id)
             if jobs:
                 initiator_party_id = jobs[0].f_initiator_party_id
                 job_work_mode = jobs[0].f_work_mode
                 if len(jobs) > 1:
                     # i am initiator
                     my_party_id = initiator_party_id
                 else:
                     my_party_id = jobs[0].f_party_id
                     initiator_party_id = jobs[0].f_initiator_party_id
                 api_utils.federated_api(
                     job_id=job_id,
                     method='POST',
                     endpoint='/{}/job/stop'.format(API_VERSION),
                     src_party_id=my_party_id,
                     dest_party_id=initiator_party_id,
                     src_role=None,
                     json_body={
                         'job_id': job_id,
                         'operate': 'kill'
                     },
                     work_mode=job_work_mode)
                 TaskScheduler.finish_job(job_id=job_id,
                                          job_runtime_conf=json_loads(
                                              jobs[0].f_runtime_conf),
                                          stop=True)
     except Exception as e:
         detect_logger.exception(e)
     finally:
         detect_logger.info('finish detect running job')
Example #19
 def save_output_data_table(self,
                            data_table: Table,
                            data_name: str = 'component'):
     """
     Save component output data, will run in the task executor process
     :param data_table:
     :param data_name:
     :return:
     """
     if data_table:
         persistent_table_namespace, persistent_table_name = 'output_data_{}'.format(
             self.task_id), data_table.get_name()
         schedule_logger(self.job_id).info(
             'persisting the component output temporary table: {} {} to {} {}'
             .format(data_table.get_namespace(), data_table.get_name(),
                     persistent_table_namespace, persistent_table_name))
         persistent_table = data_table.save_as(
             namespace=persistent_table_namespace,
             name=persistent_table_name)
         persistent_table_metas = {}
         persistent_table_metas.update(data_table.get_metas())
         persistent_table_metas["schema"] = data_table.schema
         session.save_data_table_meta(
             persistent_table_metas,
             data_table_namespace=persistent_table.get_namespace(),
             data_table_name=persistent_table.get_name())
         data_table_info = {
             data_name: {
                 'name': persistent_table.get_name(),
                 'namespace': persistent_table.get_namespace()
             }
         }
     else:
         data_table_info = {}
     session.save_data(data_table_info.items(),
                       name=Tracking.output_table_name('data'),
                       namespace=self.table_namespace,
                       partition=48)
     self.save_data_view(
         self.role,
         self.party_id,
         data_info={
             'f_table_name':
             persistent_table._name if data_table else '',
             'f_table_namespace':
             persistent_table._namespace if data_table else '',
             'f_partition':
             persistent_table._partitions if data_table else None,
             'f_table_count_actual':
             data_table.count() if data_table else 0
         },
         mark=True)
Example #20
 def clean_job(job_id, role, party_id, roles, party_ids):
     schedule_logger(job_id).info('job {} on {} {} start to clean'.format(job_id, role, party_id))
     tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
     for task in tasks:
         try:
             Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task(roles, party_ids)
             schedule_logger(job_id).info(
                 'job {} component {} on {} {} clean done'.format(job_id, task.f_component_name, role, party_id))
         except Exception as e:
             schedule_logger(job_id).info(
                 'job {} component {} on {} {} clean failed'.format(job_id, task.f_component_name, role, party_id))
             schedule_logger(job_id).exception(e)
     schedule_logger(job_id).info('job {} on {} {} clean done'.format(job_id, role, party_id))
Example #21
    def kill_job(job_id, role, party_id, job_initiator, timeout=False):
        schedule_logger(job_id).info('{} {} get kill job {} command'.format(role, party_id, job_id))
        tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
        for task in tasks:
            kill_status = False
            try:
                kill_status = job_utils.kill_process(int(task.f_run_pid))
            except Exception as e:
                schedule_logger(job_id).exception(e)
            finally:
                schedule_logger(job_id).info(
                    'job {} component {} on {} {} process {} kill {}'.format(job_id, task.f_component_name, task.f_role,
                                                                             task.f_party_id, task.f_run_pid,
                                                                             'success' if kill_status else 'failed'))
            status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT

            if task.f_status != TaskStatus.SUCCESS:
                task.f_status = status
            try:
                TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name, task_id=task.f_task_id,
                                              role=role,
                                              party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                                              task_info=task.to_json(), initiator_role=job_initiator.get('role', None))
            except Exception as e:
                schedule_logger(job_id).exception(e)
Example #22
    def kill_job(job_id, role, party_id, job_initiator, timeout=False, component_name=''):
        schedule_logger(job_id).info('{} {} get kill job {} {} command'.format(role, party_id, job_id, component_name))
        task_info = job_utils.get_task_info(job_id, role, party_id, component_name)
        tasks = job_utils.query_task(**task_info)
        job = job_utils.query_job(job_id=job_id)
        for task in tasks:
            kill_status = False
            try:
                # task clean up
                runtime_conf = json_loads(job[0].f_runtime_conf)
                roles = ','.join(runtime_conf['role'].keys())
                party_ids = ','.join([','.join([str(j) for j in i]) for i in runtime_conf['role'].values()])
                # Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task(roles, party_ids)
                # stop task
                kill_status = job_utils.kill_task_executor_process(task)
                # session stop
                job_utils.start_session_stop(task)
            except Exception as e:
                schedule_logger(job_id).exception(e)
            finally:
                schedule_logger(job_id).info(
                    'job {} component {} on {} {} process {} kill {}'.format(job_id, task.f_component_name, task.f_role,
                                                                             task.f_party_id, task.f_run_pid,
                                                                             'success' if kill_status else 'failed'))
            status = TaskStatus.FAILED if not timeout else TaskStatus.TIMEOUT

            if task.f_status != TaskStatus.COMPLETE:
                task.f_status = status
            try:
                TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name, task_id=task.f_task_id,
                                              role=role,
                                              party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                                              task_info=task.to_json(), initiator_role=job_initiator.get('role', None))
            except Exception as e:
                schedule_logger(job_id).exception(e)
Example #23
    def clean_queue():
        schedule_logger().info('get clean queue command')
        jobs = job_utils.query_job(is_initiator=1, status=JobStatus.WAITING)
        if jobs:
            for job in jobs:
                schedule_logger(job.f_job_id).info(
                    'start send {} job {} command success'.format(JobStatus.CANCELED, job.f_job_id))
                job_info = {'f_job_id': job.f_job_id, 'f_status': JobStatus.CANCELED}
                roles = json_loads(job.f_roles)
                job_work_mode = job.f_work_mode
                initiator_party_id = job.f_party_id

                TaskScheduler.sync_job_status(job_id=job.f_job_id, roles=roles, initiator_party_id=initiator_party_id,
                                              initiator_role=job.f_role,
                                              work_mode=job_work_mode,
                                              job_info=job_info)
                job_runtime_conf = json_loads(job.f_runtime_conf)
                event = job_utils.job_event(job.f_job_id,
                                            job_runtime_conf['initiator']['role'],
                                            job_runtime_conf['initiator']['party_id'])
                try:
                    RuntimeConfig.JOB_QUEUE.del_event(event)
                    schedule_logger(job.f_job_id).info(
                        'send {} job {} command success'.format(JobStatus.CANCELED, job.f_job_id))
                except Exception as e:
                    schedule_logger(job.f_job_id).error(e)
        else:
            raise Exception('There are no jobs in the queue')
Example #24
 def read_data_from_db(self, metric_namespace: str, metric_name: str, data_type, job_level=False):
     with DB.connection_context():
         metrics = []
         try:
             query_sql = 'select f_key, f_value from t_tracking_metric_{} where ' \
                         'f_job_id = "{}" and f_component_name = "{}" and f_role = "{}" and f_party_id = "{}"' \
                         'and f_task_id = "{}" and f_metric_namespace = "{}" and f_metric_name= "{}" and f_type="{}" order by f_id'.format(
                 self.get_table_index(), self.job_id, self.component_name if not job_level else 'dag', self.role,
                 self.party_id, self.task_id, metric_namespace, metric_name, data_type)
             cursor = DB.execute_sql(query_sql)
             for row in cursor.fetchall():
                 yield deserialize_b64(row[0]), deserialize_b64(row[1])
         except Exception as e:
             schedule_logger(self.job_id).exception(e)
         return metrics
Example #25
    def stop(job_id, end_status=JobStatus.FAILED, component_name=''):
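        # as initiator: sync the end status (when stopping the whole job), then send a cancel/kill command to every role and party; for cancel, report whether any party accepted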
        schedule_logger(job_id).info('get {} job {} {} command'.format("cancel" if end_status == JobStatus.CANCELED else "stop", job_id, component_name))
        jobs = job_utils.query_job(job_id=job_id, is_initiator=1)
        cancel_success = False
        is_cancel = (end_status == JobStatus.CANCELED)
        if jobs:
            initiator_job = jobs[0]
            job_info = {'f_job_id': job_id, 'f_status': end_status}
            roles = json_loads(initiator_job.f_roles)
            job_work_mode = initiator_job.f_work_mode
            initiator_party_id = initiator_job.f_party_id

            # set status first
            if not component_name:
                TaskScheduler.sync_job_status(job_id=job_id, roles=roles, initiator_party_id=initiator_party_id,
                                              initiator_role=initiator_job.f_role,
                                              work_mode=job_work_mode,
                                              job_info=job_info)
            for role, partys in roles.items():
                for party_id in partys:
                    response = federated_api(job_id=job_id,
                                             method='POST',
                                             endpoint='/{}/schedule/{}/{}/{}/{}'.format(
                                                 API_VERSION,
                                                 job_id,
                                                 role,
                                                 party_id,
                                                 "cancel" if is_cancel else "kill"
                                             ),
                                             src_party_id=initiator_party_id,
                                             dest_party_id=party_id,
                                             src_role=initiator_job.f_role,
                                             json_body={'job_initiator': {'party_id': initiator_job.f_party_id,
                                                                          'role': initiator_job.f_role},
                                                        'timeout': end_status == JobStatus.TIMEOUT,
                                                        'component_name': component_name
                                                        },
                                             work_mode=job_work_mode)
                    if response['retcode'] == 0:
                        cancel_success = True
                        schedule_logger(job_id).info(
                            'send {} {} {} job {} {} command successfully'.format(role, party_id, "cancel" if is_cancel else "kill",
                                                                                  job_id, component_name))
                        if is_cancel:
                            break
                    else:
                        schedule_logger(job_id).info(
                            'send {} {} {} job {} {} command failed: {}'.format(role, party_id, "cancel" if is_cancel else "kill",
                                                                                job_id, component_name, response['retmsg']))
            if is_cancel:
                return cancel_success
        else:
            jobs = job_utils.query_job(job_id=job_id)
            if jobs:
                raise Exception('Current role is not this job initiator')
            schedule_logger(job_id).info('send {} job {} {} command failed'.format("cancel" if is_cancel else "kill", job_id, component_name))
            raise Exception('can not found job: {}'.format(job_id))
Example #26
 def finish_job(job_id, job_runtime_conf, stop=False):
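     # at job end, ask every party to save its pipeline (skipped when the job was stopped) and clean its temporary tables, then release the job logger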
     job_parameters = job_runtime_conf['job_parameters']
     job_initiator = job_runtime_conf['initiator']
     model_id_base64 = base64_encode(job_parameters['model_id'])
     model_version_base64 = base64_encode(job_parameters['model_version'])
     roles = ','.join(job_runtime_conf['role'].keys())
     party_ids = ','.join([','.join([str(j) for j in i]) for i in job_runtime_conf['role'].values()])
     for role, partys in job_runtime_conf['role'].items():
         for party_id in partys:
             # save pipeline
             if not stop:
                 federated_api(job_id=job_id,
                               method='POST',
                               endpoint='/{}/schedule/{}/{}/{}/{}/{}/save/pipeline'.format(
                                   API_VERSION,
                                   job_id,
                                   role,
                                   party_id,
                                   model_id_base64,
                                   model_version_base64
                               ),
                               src_party_id=job_initiator['party_id'],
                               dest_party_id=party_id,
                               src_role=job_initiator['role'],
                               json_body={},
                               work_mode=job_parameters['work_mode'])
             # clean
             federated_api(job_id=job_id,
                           method='POST',
                           endpoint='/{}/schedule/{}/{}/{}/{}/{}/clean'.format(
                               API_VERSION,
                               job_id,
                               role,
                               party_id,
                               roles,
                               party_ids
                           ),
                           src_party_id=job_initiator['party_id'],
                           dest_party_id=party_id,
                           src_role=job_initiator['role'],
                           json_body={},
                           work_mode=job_parameters['work_mode'])
     schedule_logger(job_id, delete=True)
Example #27
def start_session_stop(task):
    job_conf_dict = get_job_conf(task.f_job_id)
    runtime_conf = job_conf_dict['job_runtime_conf_path']
    process_cmd = [
        'python3', sys.modules[session_utils.SessionStop.__module__].__file__,
        '-j', '{}_{}_{}'.format(task.f_task_id, task.f_role,
                                task.f_party_id), '-w',
        str(runtime_conf.get('job_parameters').get('work_mode')), '-b',
        str(runtime_conf.get('job_parameters').get('backend', 0)), '-c',
        'stop' if task.f_status == TaskStatus.COMPLETE else 'kill'
    ]
    schedule_logger(task.f_job_id).info(
        'start run subprocess to stop component {} session'.format(
            task.f_component_name))
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id),
                            task.f_role, task.f_party_id,
                            task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    p = run_subprocess(config_dir=task_dir,
                       process_cmd=process_cmd,
                       log_dir=None)
Example #28
def is_task_executor_process(task: Task, process: psutil.Process):
    """
    check the process if task executor or not by command
    :param task:
    :param process:
    :return:
    """
    # Todo: The same map should be used for run task command
    run_cmd_map = {
        3: "f_job_id",
        5: "f_component_name",
        7: "f_task_id",
        9: "f_role",
        11: "f_party_id"
    }
    try:
        cmdline = process.cmdline()
    except Exception as e:
        # Not sure whether the process is a task executor process, operations processing is required
        schedule_logger(task.f_job_id).warning(e)
        return False
    for i, k in run_cmd_map.items():
        if len(cmdline) > i and cmdline[i] == getattr(task, k):
            continue
        else:
            # todo: The logging level should be obtained first
            if len(cmdline) > i:
                schedule_logger(task.f_job_id).debug(cmdline[i])
                schedule_logger(task.f_job_id).debug(getattr(task, k))
            return False
    else:
        return True
Example #29
 def save_metric_meta_remote(self,
                             metric_namespace: str,
                             metric_name: str,
                             metric_meta: MetricMeta,
                             job_level: bool = False):
     # TODO: In the next version will be moved to tracking api module on arch/api package
     schedule_logger(self.job_id).info(
         'request save job {} component {} on {} {} {} {} metric meta'.
         format(self.job_id, self.component_name, self.role, self.party_id,
                metric_namespace, metric_name))
     request_body = dict()
     request_body['metric_namespace'] = metric_namespace
     request_body['metric_name'] = metric_name
     request_body['metric_meta'] = serialize_b64(metric_meta, to_str=True)
     request_body['job_level'] = job_level
     response = api_utils.local_api(
         method='POST',
         endpoint='/{}/tracking/{}/{}/{}/{}/{}/metric_meta/save'.format(
             API_VERSION, self.job_id, self.component_name, self.task_id,
             self.role, self.party_id),
         json_body=request_body)
     return response['retcode'] == 0
Example #30
 def finish_job(job_id, job_runtime_conf):
     job_parameters = job_runtime_conf['job_parameters']
     job_initiator = job_runtime_conf['initiator']
     model_id_base64 = base64_encode(job_parameters['model_id'])
     model_version_base64 = base64_encode(job_parameters['model_version'])
     for role, partys in job_runtime_conf['role'].items():
         for party_id in partys:
             # save pipeline
             federated_api(job_id=job_id,
                           method='POST',
                           endpoint='/{}/schedule/{}/{}/{}/{}/{}/save/pipeline'.format(
                               API_VERSION,
                               job_id,
                               role,
                               party_id,
                               model_id_base64,
                               model_version_base64
                           ),
                           src_party_id=job_initiator['party_id'],
                           dest_party_id=party_id,
                           src_role=job_initiator['role'],
                           json_body={},
                           work_mode=job_parameters['work_mode'])
             # clean
             """
             federated_api(job_id=job_id,
                           method='POST',
                           endpoint='/{}/schedule/{}/{}/{}/clean'.format(
                               API_VERSION,
                               job_id,
                               role,
                               party_id),
                           src_party_id=job_initiator['party_id'],
                           dest_party_id=party_id,
                           src_role=job_initiator['role'],
                           json_body={},
                           work_mode=job_parameters['work_mode'])
             """
     schedule_logger(job_id, delete=True)
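Taken together, these examples show schedule_logger acting as a per-job logger factory: schedule_logger(job_id) returns a logger bound to that job's log directory, schedule_logger() without arguments logs at the server level (see Examples #18 and #23), and schedule_logger(job_id, delete=True) releases the handle once the job is finished (Examples #26 and #30). The sketch below is only a minimal illustration of that pattern, not code from the project; job_id and risky_step() are hypothetical placeholders, and only the calls already seen above are used.

# Minimal usage sketch (hypothetical job_id and risky_step(); calls mirror the examples above).
job_id = '20200101000000000000'
schedule_logger(job_id).info('job {} start'.format(job_id))  # per-job log file
schedule_logger().info('server level message')               # no job id: server-level log
try:
    risky_step()
except Exception as e:
    schedule_logger(job_id).exception(e)                     # log with traceback
finally:
    schedule_logger(job_id, delete=True)                     # release the handle at job end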