Example #1
def update_job_progress(job_id, dag, current_task_id):
    component_count = len(dag.get_dependency()['component_list'])
    success_count = success_task_count(job_id=job_id)
    job = Job()
    job.f_progress = float(success_count) / component_count * 100
    job.f_update_time = current_timestamp()
    job.f_current_tasks = json_dumps([current_task_id])
    return job
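
The progress figure here is simply the count of successfully finished tasks over the number of components in the parsed DAG, scaled to a percentage. Below is a minimal sketch of that arithmetic with stubbed inputs (DummyDag, the component names, and the counts are hypothetical, used only to make the formula runnable on its own); note that as written the function would raise ZeroDivisionError if the parser ever returned an empty component_list.

class DummyDag:
    # Hypothetical stand-in for the parsed DSL object passed in as `dag`.
    def get_dependency(self):
        return {'component_list': ['reader_0', 'intersect_0', 'hetero_lr_0', 'evaluation_0']}

dag = DummyDag()
component_count = len(dag.get_dependency()['component_list'])  # 4 components in total
success_count = 1                                              # pretend one task has finished
progress = float(success_count) / component_count * 100
print(progress)  # 25.0
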
Example #2
    def submit_job(job_data):
        job_id = generate_job_id()
        schedule_logger.info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
        job_parameters = job_runtime_conf['job_parameters']
        job_initiator = job_runtime_conf['initiator']
        job_type = job_parameters.get('job_type', '')
        if job_type != 'predict':
            # generate job model info
            job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
            job_parameters['model_version'] = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                                   model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
            pipeline_model = job_tracker.get_output_model('pipeline')
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
        job_dsl_path, job_runtime_conf_path = save_job_conf(job_id=job_id,
                                                            job_dsl=job_dsl,
                                                            job_runtime_conf=job_runtime_conf)

        job = Job()
        job.f_job_id = job_id
        job.f_roles = json_dumps(job_runtime_conf['role'])
        job.f_work_mode = job_parameters['work_mode']
        job.f_initiator_party_id = job_initiator['party_id']
        job.f_dsl = json_dumps(job_dsl)
        job.f_runtime_conf = json_dumps(job_runtime_conf)
        job.f_train_runtime_conf = json_dumps(train_runtime_conf)
        job.f_run_ip = ''
        job.f_status = JobStatus.WAITING
        job.f_progress = 0
        job.f_create_time = current_timestamp()

        # save job info
        TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)

        # push into queue
        RuntimeConfig.JOB_QUEUE.put_event({
            'job_id': job_id,
            'initiator_role': job_initiator['role'],
            'initiator_party_id': job_initiator['party_id']
        })
        schedule_logger.info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
        board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
        return (job_id, job_dsl_path, job_runtime_conf_path,
                {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']},
                board_url)
Example #3
    def schedule_running_job(cls, job: Job, force_sync_status=False):
        schedule_logger(job.f_job_id).info("scheduling running job")

        dsl_parser = schedule_utils.get_job_dsl_parser(
            dsl=job.f_dsl,
            runtime_conf=job.f_runtime_conf_on_party,
            train_runtime_conf=job.f_train_runtime_conf)
        task_scheduling_status_code, auto_rerun_tasks, tasks = TaskScheduler.schedule(
            job=job, dsl_parser=dsl_parser, canceled=job.f_cancel_signal)
        tasks_status = dict([(task.f_component_name, task.f_status)
                             for task in tasks])
        new_job_status = cls.calculate_job_status(
            task_scheduling_status_code=task_scheduling_status_code,
            tasks_status=tasks_status.values())
        if new_job_status == JobStatus.WAITING and job.f_cancel_signal:
            new_job_status = JobStatus.CANCELED
        total, finished_count = cls.calculate_job_progress(
            tasks_status=tasks_status)
        new_progress = float(finished_count) / total * 100
        schedule_logger(job.f_job_id).info(
            f"job status is {new_job_status}, calculate by task status list: {tasks_status}"
        )
        if new_job_status != job.f_status or new_progress != job.f_progress:
            # update progress and status separately: each field is only allowed to move forward,
            # so the two are checked and synced independently
            if int(new_progress) - job.f_progress > 0:
                job.f_progress = new_progress
                FederatedScheduler.sync_job(job=job,
                                            update_fields=["progress"])
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["progress"])
            if new_job_status != job.f_status:
                job.f_status = new_job_status
                if EndStatus.contains(job.f_status):
                    FederatedScheduler.save_pipelined_model(job=job)
                FederatedScheduler.sync_job_status(job=job)
                cls.update_job_on_initiator(initiator_job=job,
                                            update_fields=["status"])
        if EndStatus.contains(job.f_status):
            cls.finish(job=job, end_status=job.f_status)
        if auto_rerun_tasks:
            schedule_logger(job.f_job_id).info("job have auto rerun tasks")
            cls.set_job_rerun(job_id=job.f_job_id,
                              initiator_role=job.f_initiator_role,
                              initiator_party_id=job.f_initiator_party_id,
                              tasks=auto_rerun_tasks,
                              auto=True)
        if force_sync_status:
            FederatedScheduler.sync_job_status(job=job)
        schedule_logger(job.f_job_id).info("finish scheduling running job")
Example #4
    def run_job(job_id, initiator_role, initiator_party_id):
        job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(
            job_id=job_id, role=initiator_role, party_id=initiator_party_id)
        job_parameters = job_runtime_conf.get('job_parameters', {})
        job_initiator = job_runtime_conf.get('initiator', {})
        dag = get_job_dsl_parser(dsl=job_dsl,
                                 runtime_conf=job_runtime_conf,
                                 train_runtime_conf=train_runtime_conf)
        job_args = dag.get_args_input()
        if not job_initiator:
            return False
        storage.init_storage(job_id=job_id, work_mode=RuntimeConfig.WORK_MODE)
        job = Job()
        job.f_job_id = job_id
        job.f_start_time = current_timestamp()
        job.f_status = JobStatus.RUNNING
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job.to_json())

        top_level_task_status = set()
        components = dag.get_next_components(None)
        schedule_logger.info('job {} root components are {}'.format(
            job.f_job_id, [component.get_name() for component in components]))
        for component in components:
            try:
                # run a component as task
                run_status = TaskScheduler.run_component(
                    job_id, job_runtime_conf, job_parameters, job_initiator,
                    job_args, dag, component)
            except Exception as e:
                schedule_logger.info(e)
                run_status = False
            top_level_task_status.add(run_status)
            if not run_status:
                break
        if len(top_level_task_status) == 2:
            job.f_status = JobStatus.PARTIAL
        elif True in top_level_task_status:
            job.f_status = JobStatus.SUCCESS
        else:
            job.f_status = JobStatus.FAILED
        job.f_end_time = current_timestamp()
        job.f_elapsed = job.f_end_time - job.f_start_time
        if job.f_status == JobStatus.SUCCESS:
            job.f_progress = 100
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(
            job_id=job_id,
            roles=job_runtime_conf['role'],
            work_mode=job_parameters['work_mode'],
            initiator_party_id=job_initiator['party_id'],
            job_info=job.to_json())
        TaskScheduler.finish_job(job_id=job_id,
                                 job_runtime_conf=job_runtime_conf)
        schedule_logger.info('job {} finished, status is {}'.format(
            job.f_job_id, job.f_status))
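
The loop above folds each root component's boolean outcome into a set and stops at the first failure, so the set's contents summarize the run: {True, False} means some components succeeded before one failed, {True} means every root component succeeded, and {False} (or an empty set, if there were no components) falls through to failure. A small illustration of that mapping, using plain strings in place of the JobStatus constants from this example:

def summarize(top_level_task_status):
    # Same decision rule as the tail of run_job in example #4.
    if len(top_level_task_status) == 2:
        return 'PARTIAL'
    elif True in top_level_task_status:
        return 'SUCCESS'
    return 'FAILED'

print(summarize({True, False}))  # PARTIAL: some succeeded before one failed
print(summarize({True}))         # SUCCESS: every root component ran cleanly
print(summarize({False}))        # FAILED: the first component already failed
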
Example #5
    def run_job(job_id, initiator_role, initiator_party_id):
        job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                        role=initiator_role,
                                                                                        party_id=initiator_party_id)
        job_parameters = job_runtime_conf.get('job_parameters', {})
        job_initiator = job_runtime_conf.get('initiator', {})
        dag = get_job_dsl_parser(dsl=job_dsl,
                                 runtime_conf=job_runtime_conf,
                                 train_runtime_conf=train_runtime_conf)
        job_args = dag.get_args_input()
        if not job_initiator:
            return False
        timeout = job_utils.get_timeout(job_id, job_parameters.get("timeout", None), job_runtime_conf, job_dsl)
        t = Timer(timeout, TaskScheduler.job_handler, [job_id])
        t.start()

        job = Job()
        job.f_job_id = job_id
        job.f_start_time = current_timestamp()
        job.f_status = JobStatus.RUNNING
        job.f_update_time = current_timestamp()
        TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                      work_mode=job_parameters['work_mode'],
                                      initiator_party_id=job_initiator['party_id'],
                                      initiator_role=job_initiator['role'],
                                      job_info=job.to_json())

        top_level_task_status = set()
        components = dag.get_next_components(None)
        schedule_logger(job_id).info(
            'job {} root components are {}'.format(
                job.f_job_id, [component.get_name() for component in components]))
        for component in components:
            try:
                # run a component as task
                run_status = TaskScheduler.run_component(job_id, job_runtime_conf, job_parameters, job_initiator,
                                                         job_args, dag,
                                                         component)
            except Exception as e:
                schedule_logger(job_id).exception(e)
                run_status = False
            top_level_task_status.add(run_status)
            if not run_status:
                break
        if len(top_level_task_status) == 2:
            job.f_status = JobStatus.FAILED
        elif True in top_level_task_status:
            job.f_status = JobStatus.COMPLETE
        else:
            job.f_status = JobStatus.FAILED
        job.f_end_time = current_timestamp()
        job.f_elapsed = job.f_end_time - job.f_start_time
        if job.f_status == JobStatus.COMPLETE:
            job.f_progress = 100
        job.f_update_time = current_timestamp()
        try:
            TaskScheduler.finish_job(job_id=job_id, job_runtime_conf=job_runtime_conf)
        except Exception as e:
            schedule_logger(job_id).exception(e)
            job.f_status = JobStatus.FAILED

        if job.f_status == JobStatus.FAILED:
            TaskScheduler.stop(job_id=job_id, end_status=JobStatus.FAILED)

        try:
            TaskScheduler.sync_job_status(job_id=job_id, roles=job_runtime_conf['role'],
                                          work_mode=job_parameters['work_mode'],
                                          initiator_party_id=job_initiator['party_id'],
                                          initiator_role=job_initiator['role'],
                                          job_info=job.to_json())
        except Exception as e:
            schedule_logger(job_id).exception(e)
            schedule_logger(job_id).warning('job {} sync status failed'.format(job.f_job_id))

        schedule_logger(job_id).info('job {} finished, status is {}'.format(job.f_job_id, job.f_status))
        t.cancel()
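
Compared with example #4, this version arms a threading.Timer watchdog before running the job and cancels it once the run ends, so TaskScheduler.job_handler only fires if the job overruns its timeout. A minimal, self-contained sketch of that arm/cancel pattern, with a hypothetical on_timeout handler standing in for job_handler:

from threading import Timer
import time

def on_timeout(job_id):
    # Hypothetical handler; in example #5 this role is played by TaskScheduler.job_handler.
    print('job {} exceeded its timeout'.format(job_id))

t = Timer(2.0, on_timeout, ['job_123'])  # arm the watchdog before starting the job
t.start()
time.sleep(1.0)                          # the "job" finishes within the allowed window
t.cancel()                               # so the handler never fires
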
Example #6
    def submit_job(job_data):
        job_id = generate_job_id()
        schedule_logger(job_id).info('submit job, job_id {}, body {}'.format(job_id, job_data))
        job_dsl = job_data.get('job_dsl', {})
        job_runtime_conf = job_data.get('job_runtime_conf', {})
        job_utils.check_pipeline_job_runtime_conf(job_runtime_conf)
        job_parameters = job_runtime_conf['job_parameters']
        job_initiator = job_runtime_conf['initiator']
        job_type = job_parameters.get('job_type', '')
        if job_type != 'predict':
            # generate job model info
            job_parameters['model_id'] = '#'.join([dtable_utils.all_party_key(job_runtime_conf['role']), 'model'])
            job_parameters['model_version'] = job_id
            train_runtime_conf = {}
        else:
            detect_utils.check_config(job_parameters, ['model_id', 'model_version'])
            # get inference dsl from pipeline model as job dsl
            job_tracker = Tracking(job_id=job_id, role=job_initiator['role'], party_id=job_initiator['party_id'],
                                   model_id=job_parameters['model_id'], model_version=job_parameters['model_version'])
            pipeline_model = job_tracker.get_output_model('pipeline')
            job_dsl = json_loads(pipeline_model['Pipeline'].inference_dsl)
            train_runtime_conf = json_loads(pipeline_model['Pipeline'].train_runtime_conf)
        path_dict = save_job_conf(job_id=job_id,
                                  job_dsl=job_dsl,
                                  job_runtime_conf=job_runtime_conf,
                                  train_runtime_conf=train_runtime_conf,
                                  pipeline_dsl=None)

        job = Job()
        job.f_job_id = job_id
        job.f_roles = json_dumps(job_runtime_conf['role'])
        job.f_work_mode = job_parameters['work_mode']
        job.f_initiator_party_id = job_initiator['party_id']
        job.f_dsl = json_dumps(job_dsl)
        job.f_runtime_conf = json_dumps(job_runtime_conf)
        job.f_train_runtime_conf = json_dumps(train_runtime_conf)
        job.f_run_ip = ''
        job.f_status = JobStatus.WAITING
        job.f_progress = 0
        job.f_create_time = current_timestamp()

        initiator_role = job_initiator['role']
        initiator_party_id = job_initiator['party_id']
        if initiator_party_id not in job_runtime_conf['role'][initiator_role]:
            schedule_logger(job_id).info("initiator party id error:{}".format(initiator_party_id))
            raise Exception("initiator party id error {}".format(initiator_party_id))

        get_job_dsl_parser(dsl=job_dsl,
                           runtime_conf=job_runtime_conf,
                           train_runtime_conf=train_runtime_conf)

        TaskScheduler.distribute_job(job=job, roles=job_runtime_conf['role'], job_initiator=job_initiator)

        # push into queue
        job_event = job_utils.job_event(job_id, initiator_role, initiator_party_id)
        try:
            RuntimeConfig.JOB_QUEUE.put_event(job_event)
        except Exception as e:
            raise Exception('push job into queue failed') from e

        schedule_logger(job_id).info(
            'submit job successfully, job id is {}, model id is {}'.format(job.f_job_id, job_parameters['model_id']))
        board_url = BOARD_DASHBOARD_URL.format(job_id, job_initiator['role'], job_initiator['party_id'])
        logs_directory = get_job_log_directory(job_id)
        return (job_id, path_dict['job_dsl_path'], path_dict['job_runtime_conf_path'], logs_directory,
                {'model_id': job_parameters['model_id'], 'model_version': job_parameters['model_version']},
                board_url)
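
Example #6 also verifies, before distributing the job, that the submitting party id actually appears under its declared role in job_runtime_conf['role']. A small illustration of that membership check; the role_map dict and party ids below are assumptions about the conf's shape, not values taken from the source:

role_map = {'guest': [10000], 'host': [9999, 9998]}  # hypothetical role -> party id mapping
initiator_role = 'guest'
initiator_party_id = 10000

# Same membership test used near the top of submit_job in example #6.
if initiator_party_id not in role_map[initiator_role]:
    raise Exception('initiator party id error {}'.format(initiator_party_id))
print('initiator {} {} is valid'.format(initiator_role, initiator_party_id))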