Example #1: check_task_status
 def check_task_status(job_id, component, interval=1):
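     """Poll the status of one component's tasks across all roles and parties
     until every party reports success (return True) or any party reports
     failed (return False)."""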
     task_id = job_utils.generate_task_id(
         job_id=job_id, component_name=component.get_name())
     while True:
         try:
             status_collect = set()
             parameters = component.get_role_parameters()
             for _role, _partys_parameters in parameters.items():
                 for _party_parameters in _partys_parameters:
                     _party_id = _party_parameters.get('local',
                                                       {}).get('party_id')
                     tasks = query_task(job_id=job_id,
                                        task_id=task_id,
                                        role=_role,
                                        party_id=_party_id)
                     if tasks:
                         task_status = tasks[0].f_status
                     else:
                         task_status = 'notRunning'
                     schedule_logger.info(
                         'job {} component {} run on {} {} status is {}'.
                         format(job_id, component.get_name(), _role,
                                _party_id, task_status))
                     status_collect.add(task_status)
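             # fail fast if any party failed; succeed only when every party
             # reports success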
             if 'failed' in status_collect:
                 return False
             elif len(status_collect) == 1 and 'success' in status_collect:
                 return True
             else:
                 time.sleep(interval)
         except Exception as e:
             schedule_logger.exception(e)
             return False
Example #2: start_task
 def start_task(job_id, component_name, task_id, role, party_id,
                task_config):
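     """Dump the task config to the job directory and launch the task
     executor for this role/party as a subprocess."""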
     schedule_logger.info('job {} {} {} {} task subprocess is ready'.format(
         job_id, component_name, role, party_id))
     task_process_start_status = False
     try:
         task_dir = os.path.join(job_utils.get_job_directory(job_id=job_id),
                                 role, party_id, component_name)
         os.makedirs(task_dir, exist_ok=True)
         task_config_path = os.path.join(task_dir, 'task_config.json')
         with open(task_config_path, 'w') as fw:
             json.dump(task_config, fw)
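         # run the task executor module in a separate python3 process,
         # passing the task identity and the config file path on the CLI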
         process_cmd = [
             'python3', sys.modules[TaskExecutor.__module__].__file__, '-j',
             job_id, '-n', component_name, '-t', task_id, '-r', role, '-p',
             party_id, '-c', task_config_path
         ]
         task_log_dir = os.path.join(
             job_utils.get_job_log_directory(job_id=job_id), role, party_id,
             component_name)
         schedule_logger.info(
             'job {} {} {} {} task subprocess start'.format(
                 job_id, component_name, role, party_id))
         p = job_utils.run_subprocess(config_dir=task_dir,
                                      process_cmd=process_cmd,
                                      log_dir=task_log_dir)
         if p:
             task_process_start_status = True
     except Exception as e:
         schedule_logger.exception(e)
     finally:
         schedule_logger.info(
             'job {} component {} on {} {} start task subprocess {}'.format(
                 job_id, component_name, role, party_id,
                 'success' if task_process_start_status else 'failed'))
Example #3: clean_job
 def clean_job(job_id, role, party_id):
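     """Clean the tracking data of every task of a job for the given role and party."""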
     schedule_logger.info('job {} on {} {} start to clean'.format(job_id, role, party_id))
     tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
     for task in tasks:
         try:
             Tracking(job_id=job_id, role=role, party_id=party_id, task_id=task.f_task_id).clean_task()
             schedule_logger.info(
                 'job {} component {} on {} {} clean done'.format(job_id, task.f_component_name, role, party_id))
         except Exception as e:
             schedule_logger.info(
                 'job {} component {} on {} {} clean failed'.format(job_id, task.f_component_name, role, party_id))
             schedule_logger.exception(e)
     schedule_logger.info('job {} on {} {} clean done'.format(job_id, role, party_id))
Example #4: run (scheduler loop)
 def run(self):
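     """Scheduler main loop: pull job events off the queue and dispatch them
     to the executor pool, keeping at most concurrent_num jobs in flight."""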
     if not self.queue.is_ready():
         schedule_logger.error('queue is not ready')
         return False
     all_jobs = []
     while True:
         try:
             if len(all_jobs) == self.concurrent_num:
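                 # pool is full: block until at least one running job finishes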
                 for future in as_completed(all_jobs):
                     all_jobs.remove(future)
                     break
             job_event = self.queue.get_event()
             schedule_logger.info('schedule job {}'.format(job_event))
             future = self.job_executor_pool.submit(DAGScheduler.handle_event, job_event)
             future.add_done_callback(DAGScheduler.get_result)
             all_jobs.append(future)
         except Exception as e:
             schedule_logger.exception(e)
Example #5: kill_job
 def kill_job(job_id, role, party_id, job_initiator):
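     """Kill the running process of every task of a job and sync the final
     task status back to the job initiator."""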
     schedule_logger.info('{} {} get kill job {} command'.format(role, party_id, job_id))
     tasks = job_utils.query_task(job_id=job_id, role=role, party_id=party_id)
     for task in tasks:
         kill_status = False
         try:
             kill_status = job_utils.kill_process(int(task.f_run_pid))
         except Exception as e:
             schedule_logger.exception(e)
         finally:
             schedule_logger.info(
                 'job {} component {} on {} {} process {} kill {}'.format(job_id, task.f_component_name, task.f_role,
                                                                          task.f_party_id, task.f_run_pid,
                                                                          'success' if kill_status else 'failed'))
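         # downgrade any unfinished task to failed before reporting back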
         if task.f_status != TaskStatus.SUCCESS:
             task.f_status = TaskStatus.FAILED
         TaskExecutor.sync_task_status(job_id=job_id, component_name=task.f_component_name, task_id=task.f_task_id,
                                       role=role,
                                       party_id=party_id, initiator_party_id=job_initiator.get('party_id', None),
                                       task_info=task.to_json())
Example #6: run_task
    def run_task():
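        """Entry point of the task subprocess: parse the CLI arguments written
        by start_task, initialize storage/federation, run the component, save
        its outputs, and sync the final task status."""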
        task = Task()
        task.f_create_time = current_timestamp()
        try:
            parser = argparse.ArgumentParser()
            parser.add_argument('-j',
                                '--job_id',
                                required=True,
                                type=str,
                                help="job id")
            parser.add_argument('-n',
                                '--component_name',
                                required=True,
                                type=str,
                                help="component name")
            parser.add_argument('-t',
                                '--task_id',
                                required=True,
                                type=str,
                                help="task id")
            parser.add_argument('-r',
                                '--role',
                                required=True,
                                type=str,
                                help="role")
            parser.add_argument('-p',
                                '--party_id',
                                required=True,
                                type=str,
                                help="party id")
            parser.add_argument('-c',
                                '--config',
                                required=True,
                                type=str,
                                help="task config")
            parser.add_argument('--job_server', help="job server", type=str)
            args = parser.parse_args()
            schedule_logger.info('enter task process')
            schedule_logger.info(args)
            # init function args
            if args.job_server:
                RuntimeConfig.init_config(
                    HTTP_PORT=args.job_server.split(':')[1])
            job_id = args.job_id
            component_name = args.component_name
            task_id = args.task_id
            role = args.role
            party_id = int(args.party_id)
            task_config = file_utils.load_json_conf(args.config)
            job_parameters = task_config['job_parameters']
            job_initiator = task_config['job_initiator']
            job_args = task_config['job_args']
            task_input_dsl = task_config['input']
            task_output_dsl = task_config['output']
            parameters = task_config['parameters']
            module_name = task_config['module_name']
        except Exception as e:
            schedule_logger.exception(e)
            task.f_status = TaskStatus.FAILED
            return
        try:
            # init environment, process is shared globally
            RuntimeConfig.init_config(WORK_MODE=job_parameters['work_mode'])
            storage.init_storage(job_id=task_id,
                                 work_mode=RuntimeConfig.WORK_MODE)
            federation.init(job_id=task_id, runtime_conf=parameters)
            job_log_dir = os.path.join(
                job_utils.get_job_log_directory(job_id=job_id), role,
                str(party_id))
            task_log_dir = os.path.join(job_log_dir, component_name)
            log_utils.LoggerFactory.set_directory(directory=task_log_dir,
                                                  parent_log_dir=job_log_dir,
                                                  append_to_parent_log=True,
                                                  force=True)

            task.f_job_id = job_id
            task.f_component_name = component_name
            task.f_task_id = task_id
            task.f_role = role
            task.f_party_id = party_id
            task.f_operator = 'python_operator'
            tracker = Tracking(job_id=job_id,
                               role=role,
                               party_id=party_id,
                               component_name=component_name,
                               task_id=task_id,
                               model_id=job_parameters['model_id'],
                               model_version=job_parameters['model_version'],
                               module_name=module_name)
            task.f_start_time = current_timestamp()
            task.f_run_ip = get_lan_ip()
            task.f_run_pid = os.getpid()
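            # 'CodePath' is expected to be a path like
            # 'federatedml/logistic_regression/hetero_lr.py/HeteroLR'
            # (path is illustrative): everything up to the .py file is the
            # module to import, the final segment is the class to instantiate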
            run_class_paths = parameters.get('CodePath').split('/')
            run_class_package = '.'.join(
                run_class_paths[:-2]) + '.' + run_class_paths[-2].replace(
                    '.py', '')
            run_class_name = run_class_paths[-1]
            task_run_args = TaskExecutor.get_task_run_args(
                job_id=job_id,
                role=role,
                party_id=party_id,
                job_parameters=job_parameters,
                job_args=job_args,
                input_dsl=task_input_dsl)
            run_object = getattr(importlib.import_module(run_class_package),
                                 run_class_name)()
            run_object.set_tracker(tracker=tracker)
            run_object.set_taskid(taskid=task_id)
            task.f_status = TaskStatus.RUNNING
            TaskExecutor.sync_task_status(job_id=job_id,
                                          component_name=component_name,
                                          task_id=task_id,
                                          role=role,
                                          party_id=party_id,
                                          initiator_party_id=job_initiator.get(
                                              'party_id', None),
                                          task_info=task.to_json())

            schedule_logger.info('run {} {} {} {} {} task'.format(
                job_id, component_name, task_id, role, party_id))
            schedule_logger.info(parameters)
            schedule_logger.info(task_input_dsl)
            run_object.run(parameters, task_run_args)
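            # persist any outputs declared in the task's output dsl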
            if task_output_dsl:
                if task_output_dsl.get('data', []):
                    output_data = run_object.save_data()
                    tracker.save_output_data_table(
                        output_data,
                        task_output_dsl.get('data')[0])
                if task_output_dsl.get('model', []):
                    output_model = run_object.export_model()
                    # There is only one model output at the current dsl version.
                    tracker.save_output_model(output_model,
                                              task_output_dsl['model'][0])
            task.f_status = TaskStatus.SUCCESS
        except Exception as e:
            schedule_logger.exception(e)
            task.f_status = TaskStatus.FAILED
        finally:
            try:
                task.f_end_time = current_timestamp()
                task.f_elapsed = task.f_end_time - task.f_start_time
                task.f_update_time = current_timestamp()
                TaskExecutor.sync_task_status(
                    job_id=job_id,
                    component_name=component_name,
                    task_id=task_id,
                    role=role,
                    party_id=party_id,
                    initiator_party_id=job_initiator.get('party_id', None),
                    task_info=task.to_json())
            except Exception as e:
                schedule_logger.exception(e)
        schedule_logger.info('finish {} {} {} {} {} {} task'.format(
            job_id, component_name, task_id, role, party_id, task.f_status))
        print('finish {} {} {} {} {} {} task'.format(job_id, component_name,
                                                     task_id, role, party_id,
                                                     task.f_status))
Example #7: handle_event
 def handle_event(job_event):
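     """Run the job for a queued event; swallow errors and return False."""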
     try:
         return TaskScheduler.run_job(**job_event)
     except Exception as e:
         schedule_logger.exception(e)
         return False