def resource_for_task(cls, task_info, operation_type):
    cores_per_task, memory_per_task = cls.calculate_task_resource(task_info=task_info)

    if cores_per_task or memory_per_task:
        filters, updates = cls.update_resource_sql(resource_model=Job,
                                                   cores=cores_per_task,
                                                   memory=memory_per_task,
                                                   operation_type=operation_type)
        filters.append(Job.f_job_id == task_info["job_id"])
        filters.append(Job.f_role == task_info["role"])
        filters.append(Job.f_party_id == task_info["party_id"])
        filters.append(Job.f_resource_in_use == True)
        operate = Job.update(updates).where(*filters)
        operate_status = operate.execute() > 0
    else:
        operate_status = True
    if operate_status:
        schedule_logger(job_id=task_info["job_id"]).info(
            "task {} {} {} resource successfully".format(task_info["task_id"],
                                                         task_info["task_version"],
                                                         operation_type))
    else:
        schedule_logger(job_id=task_info["job_id"]).warning(
            "task {} {} {} resource failed".format(task_info["task_id"],
                                                   task_info["task_version"],
                                                   operation_type))
    return operate_status
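# Note on the pattern above (a reading of the source, not original commentary): resource
# changes are applied with a single conditional UPDATE on the Job row, guarded by the
# f_resource_in_use flag and the cores/memory filters built by update_resource_sql, so
# "operate.execute() > 0" doubles as an atomic succeeded/failed check instead of a
# separate read-modify-write round trip.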
def kill_task_executor_process(task: Task, only_child=False):
    if not task.f_run_pid:
        schedule_logger(task.f_job_id).info("job {} task {} {} {} with {} party status has no process pid".format(
            task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, task.f_party_status))
        return KillProcessStatusCode.NOT_FOUND
    pid = int(task.f_run_pid)
    schedule_logger(task.f_job_id).info("try to stop job {} task {} {} {} with {} party status process pid:{}".format(
        task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, task.f_party_status, pid))
    if not check_job_process(pid):
        schedule_logger(task.f_job_id).info("cannot find job {} task {} {} {} with {} party status process pid:{}".format(
            task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, task.f_party_status, pid))
        return KillProcessStatusCode.NOT_FOUND
    p = psutil.Process(pid)
    if not is_task_executor_process(task=task, process=p):
        schedule_logger(task.f_job_id).warning("pid {} is not the executor of job {} task {} {} {}".format(
            pid, task.f_job_id, task.f_task_id, task.f_role, task.f_party_id))
        return KillProcessStatusCode.ERROR_PID
    # kill the children first, then (unless only_child) the executor process itself
    for child in p.children(recursive=True):
        if check_job_process(child.pid) and is_task_executor_process(task=task, process=child):
            child.kill()
    if not only_child:
        if check_job_process(p.pid) and is_task_executor_process(task=task, process=p):
            p.kill()
    schedule_logger(task.f_job_id).info("successfully stopped job {} task {} {} {} process pid:{}".format(
        task.f_job_id, task.f_task_id, task.f_role, task.f_party_id, pid))
    return KillProcessStatusCode.KILLED
def report_task_to_initiator(cls, task: Task):
    """
    Report the latest task status to the initiator party, retrying on failure.
    :param task:
    :return: True if the report succeeded, otherwise False
    """
    if task.f_role != task.f_initiator_role and task.f_party_id != task.f_initiator_party_id:
        exception = None
        for t in range(DEFAULT_FEDERATED_COMMAND_TRYS):
            try:
                response = federated_api(job_id=task.f_job_id,
                                         method='POST',
                                         endpoint='/initiator/{}/{}/{}/{}/{}/{}/report'.format(
                                             task.f_job_id,
                                             task.f_component_name,
                                             task.f_task_id,
                                             task.f_task_version,
                                             task.f_role,
                                             task.f_party_id),
                                         src_party_id=task.f_party_id,
                                         dest_party_id=task.f_initiator_party_id,
                                         src_role=task.f_role,
                                         json_body=task.to_human_model_dict(only_primary_with=cls.REPORT_TO_INITIATOR_FIELDS),
                                         federated_mode=task.f_federated_mode)
            except Exception as e:
                exception = e
                continue
            if response["retcode"] != RetCode.SUCCESS:
                exception = Exception(response["retmsg"])
            else:
                return True
        else:
            schedule_logger(job_id=task.f_job_id).error(f"report task to initiator error: {exception}")
            return False
    else:
        return False
def local_api(job_id, method, endpoint, json_body, api_version=API_VERSION, try_times=3):
    endpoint = f"/{api_version}{endpoint}"
    exception = None
    for t in range(try_times):
        try:
            url = "http://{}:{}{}".format(RuntimeConfig.JOB_SERVER_HOST, RuntimeConfig.HTTP_PORT, endpoint)
            audit_logger(job_id).info('local api request: {}'.format(url))
            action = getattr(requests, method.lower(), None)
            http_response = action(url=url, data=json_dumps(json_body), headers=HEADERS)
            audit_logger(job_id).info(http_response.text)
            response = http_response.json()
            audit_logger(job_id).info('local api response: {} {}'.format(endpoint, response))
            return response
        except Exception as e:
            schedule_logger(job_id).exception(e)
            exception = e
    else:
        raise Exception('local request error: {}'.format(exception))
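# Hedged usage sketch (not from the original module; the job id and endpoint below are
# hypothetical): local_api prefixes the endpoint with the API version, resolves the
# matching requests method, and retries up to try_times before raising.
#
#   response = local_api(job_id="202101010000000000",
#                        method="POST",
#                        endpoint="/job/stop",
#                        json_body={"job_id": "202101010000000000"})
#   if response["retcode"] != 0:
#       raise RuntimeError(response["retmsg"])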
def run_subprocess(job_id, config_dir, process_cmd, log_dir=None):
    schedule_logger(job_id=job_id).info('start process command: {}'.format(' '.join(process_cmd)))

    os.makedirs(config_dir, exist_ok=True)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    std_log = open(os.path.join(log_dir if log_dir else config_dir, 'std.log'), 'w')
    pid_path = os.path.join(config_dir, 'pid')

    if os.name == 'nt':
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE
    else:
        startupinfo = None
    p = subprocess.Popen(process_cmd,
                         stdout=std_log,
                         stderr=std_log,
                         startupinfo=startupinfo)
    with open(pid_path, 'w') as f:
        f.truncate()
        f.write(str(p.pid) + "\n")
        f.flush()
    schedule_logger(job_id=job_id).info(
        'start process command: {} successfully, pid is {}'.format(' '.join(process_cmd), p.pid))
    return p
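# Hedged usage sketch (hypothetical paths and command): run_subprocess records the
# child's pid in <config_dir>/pid and redirects both stdout and stderr to std.log,
# so callers usually just keep the returned Popen handle.
#
#   p = run_subprocess(job_id="202101010000000000",
#                      config_dir="/data/projects/fate/jobs/202101010000000000/conf",
#                      process_cmd=["python3", "-m", "some.module", "--flag"])
#   p.poll()  # None while the child is still running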
def start_session_stop(task):
    job_parameters = RunParameters(**get_job_parameters(job_id=task.f_job_id,
                                                        role=task.f_role,
                                                        party_id=task.f_party_id))
    computing_session_id = generate_session_id(task.f_task_id, task.f_task_version, task.f_role, task.f_party_id)
    if task.f_status != TaskStatus.WAITING:
        schedule_logger(task.f_job_id).info(f'start run subprocess to stop task session {computing_session_id}')
    else:
        schedule_logger(task.f_job_id).info(f'task is waiting, pass stop session {computing_session_id}')
        return
    task_dir = os.path.join(get_job_directory(job_id=task.f_job_id), task.f_role,
                            task.f_party_id, task.f_component_name, 'session_stop')
    os.makedirs(task_dir, exist_ok=True)
    process_cmd = [
        'python3', sys.modules[session_utils.SessionStop.__module__].__file__,
        '-j', computing_session_id,
        '--computing', job_parameters.computing_engine,
        '--federation', job_parameters.federation_engine,
        '--storage', job_parameters.storage_engine,
        '-c', 'stop' if task.f_status == JobStatus.SUCCESS else 'kill'
    ]
    p = run_subprocess(job_id=task.f_job_id, config_dir=task_dir, process_cmd=process_cmd, log_dir=None)
def collect_task_of_all_party(cls, job, initiator_task, set_status=None):
    tasks_on_all_party = JobSaver.query_task(task_id=initiator_task.f_task_id,
                                             task_version=initiator_task.f_task_version)
    tasks_status_on_all = set([task.f_status for task in tasks_on_all_party])
    if len(tasks_status_on_all) <= 1 and TaskStatus.RUNNING not in tasks_status_on_all:
        return
    status, federated_response = FederatedScheduler.collect_task(job=job, task=initiator_task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job.f_job_id).warning(
            f"collect task {initiator_task.f_task_id} {initiator_task.f_task_version} "
            f"on {initiator_task.f_role} {initiator_task.f_party_id} failed")
    for _role in federated_response.keys():
        for _party_id, party_response in federated_response[_role].items():
            if party_response["retcode"] == RetCode.SUCCESS:
                JobSaver.update_task_status(task_info=party_response["data"])
                JobSaver.update_task(task_info=party_response["data"])
            elif party_response["retcode"] == RetCode.FEDERATED_ERROR and set_status:
                tmp_task_info = {
                    "job_id": initiator_task.f_job_id,
                    "task_id": initiator_task.f_task_id,
                    "task_version": initiator_task.f_task_version,
                    "role": _role,
                    "party_id": _party_id,
                    "party_status": TaskStatus.RUNNING
                }
                JobSaver.update_task_status(task_info=tmp_task_info)
                tmp_task_info["party_status"] = set_status
                JobSaver.update_task_status(task_info=tmp_task_info)
def insert_metrics_into_db(self, metric_namespace: str, metric_name: str, data_type: int, kv, job_level=False):
    try:
        tracking_metric = self.get_dynamic_db_model(TrackingMetric, self.job_id)()
        tracking_metric.f_job_id = self.job_id
        tracking_metric.f_component_name = (self.component_name if not job_level
                                            else job_utils.job_virtual_component_name())
        tracking_metric.f_task_id = self.task_id
        tracking_metric.f_task_version = self.task_version
        tracking_metric.f_role = self.role
        tracking_metric.f_party_id = self.party_id
        tracking_metric.f_metric_namespace = metric_namespace
        tracking_metric.f_metric_name = metric_name
        tracking_metric.f_type = data_type
        default_db_source = tracking_metric.to_json()
        tracking_metric_data_source = []
        for k, v in kv:
            db_source = default_db_source.copy()
            db_source['f_key'] = serialize_b64(k)
            db_source['f_value'] = serialize_b64(v)
            db_source['f_create_time'] = current_timestamp()
            tracking_metric_data_source.append(db_source)
        self.bulk_insert_into_db(self.get_dynamic_db_model(TrackingMetric, self.job_id),
                                 tracking_metric_data_source)
    except Exception as e:
        schedule_logger(self.job_id).exception(
            "An exception occurred while inserting metric {} of metric namespace {} into database:\n{}".format(
                metric_name, metric_namespace, e))
def get_output_data_table(self, output_data_infos, tracker_client=None):
    """
    Get component output data table; runs in the task executor process.
    :param output_data_infos:
    :param tracker_client:
    :return:
    """
    output_tables_meta = {}
    if output_data_infos:
        for output_data_info in output_data_infos:
            schedule_logger(self.job_id).info("Get task {} {} output table {} {}".format(
                output_data_info.f_task_id, output_data_info.f_task_version,
                output_data_info.f_table_namespace, output_data_info.f_table_name))
            if not tracker_client:
                data_table_meta = storage.StorageTableMeta(name=output_data_info.f_table_name,
                                                           namespace=output_data_info.f_table_namespace)
            else:
                data_table_meta = tracker_client.get_table_meta(output_data_info.f_table_name,
                                                                output_data_info.f_table_namespace)
            output_tables_meta[output_data_info.f_data_name] = data_table_meta
    return output_tables_meta
def query_output_data_infos(cls, **kwargs):
    try:
        tracking_output_data_info_model = cls.get_dynamic_db_model(TrackingOutputDataInfo,
                                                                   kwargs.get("job_id"))
        filters = []
        for f_n, f_v in kwargs.items():
            attr_name = 'f_%s' % f_n
            if hasattr(tracking_output_data_info_model, attr_name):
                filters.append(operator.attrgetter(attr_name)(tracking_output_data_info_model) == f_v)
        if filters:
            output_data_infos_tmp = tracking_output_data_info_model.select().where(*filters)
        else:
            output_data_infos_tmp = tracking_output_data_info_model.select()
        output_data_infos_group = {}
        # Only the latest version of the task output data is retrieved
        for output_data_info in output_data_infos_tmp:
            group_key = cls.get_output_data_group_key(output_data_info.f_task_id,
                                                      output_data_info.f_data_name)
            if group_key not in output_data_infos_group:
                output_data_infos_group[group_key] = output_data_info
            elif output_data_info.f_task_version > output_data_infos_group[group_key].f_task_version:
                output_data_infos_group[group_key] = output_data_info
        return output_data_infos_group.values()
    except Exception as e:
        schedule_logger(kwargs.get("job_id")).exception(e)
        return []
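# Hedged usage sketch (the class name and argument values are assumptions): any keyword
# whose "f_"-prefixed name matches a model column becomes an equality filter, so callers
# can narrow the query by job, component, role, and so on:
#
#   infos = Tracker.query_output_data_infos(job_id="202101010000000000",
#                                           component_name="hetero_lr_0",
#                                           role="guest",
#                                           party_id=9999)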
def insert_summary_into_db(self, summary_data: dict):
    try:
        summary_model = self.get_dynamic_db_model(ComponentSummary, self.job_id)
        DB.create_tables([summary_model])
        summary_obj = summary_model.get_or_none(
            summary_model.f_job_id == self.job_id,
            summary_model.f_component_name == self.component_name,
            summary_model.f_role == self.role,
            summary_model.f_party_id == self.party_id,
            summary_model.f_task_id == self.task_id,
            summary_model.f_task_version == self.task_version
        )
        if summary_obj:
            summary_obj.f_summary = serialize_b64(summary_data, to_str=True)
            summary_obj.f_update_time = current_timestamp()
            summary_obj.save()
        else:
            summary_model.create(f_job_id=self.job_id,
                                 f_component_name=self.component_name,
                                 f_role=self.role,
                                 f_party_id=self.party_id,
                                 f_task_id=self.task_id,
                                 f_task_version=self.task_version,
                                 f_summary=serialize_b64(summary_data, to_str=True),
                                 f_create_time=current_timestamp())
    except Exception as e:
        schedule_logger(self.job_id).exception(
            "An exception occurred while saving the summary of job {} component {} to database:\n{}".format(
                self.job_id, self.component_name, e))
def read_metrics_from_db(self, metric_namespace: str, metric_name: str, data_type, job_level=False):
    # This is a generator: (key, value) pairs are yielded one at a time.
    try:
        tracking_metric_model = self.get_dynamic_db_model(TrackingMetric, self.job_id)
        tracking_metrics = tracking_metric_model.select(
            tracking_metric_model.f_key,
            tracking_metric_model.f_value
        ).where(
            tracking_metric_model.f_job_id == self.job_id,
            tracking_metric_model.f_component_name == (self.component_name if not job_level
                                                       else job_utils.job_virtual_component_name()),
            tracking_metric_model.f_role == self.role,
            tracking_metric_model.f_party_id == self.party_id,
            tracking_metric_model.f_metric_namespace == metric_namespace,
            tracking_metric_model.f_metric_name == metric_name,
            tracking_metric_model.f_type == data_type
        )
        for tracking_metric in tracking_metrics:
            yield deserialize_b64(tracking_metric.f_key), deserialize_b64(tracking_metric.f_value)
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
        raise
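# Hedged usage sketch (metric names are hypothetical): since read_metrics_from_db is a
# generator, callers iterate it lazily rather than receiving a list:
#
#   for key, value in tracker.read_metrics_from_db("train", "loss", data_type=1):
#       print(key, value)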
def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info('job {} on {} {} start to save pipeline'.format(job_id, role, party_id))
    job_dsl, job_runtime_conf, train_runtime_conf = job_utils.get_job_configuration(job_id=job_id,
                                                                                    role=role,
                                                                                    party_id=party_id)
    job_parameters = job_runtime_conf.get('job_parameters', {})
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    if job_type == 'predict':
        return
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                            runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    predict_dsl = dag.get_predict_dsl(role=role)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=model_id, model_version=model_version)
    tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info('job {} on {} {} save pipeline successfully'.format(job_id, role, party_id))
def federated_coordination_on_grpc(job_id, method, host, port, endpoint, src_party_id, src_role,
                                   dest_party_id, json_body, api_version=API_VERSION,
                                   overall_timeout=DEFAULT_REMOTE_REQUEST_TIMEOUT, try_times=3):
    endpoint = f"/{api_version}{endpoint}"
    json_body['src_role'] = src_role
    json_body['src_party_id'] = src_party_id
    if CHECK_NODES_IDENTITY:
        get_node_identity(json_body, src_party_id)
    _packet = wrap_grpc_packet(json_body, method, endpoint, src_party_id, dest_party_id, job_id,
                               overall_timeout=overall_timeout)
    _routing_metadata = gen_routing_metadata(src_party_id=src_party_id, dest_party_id=dest_party_id)
    exception = None
    for t in range(try_times):
        try:
            channel, stub = get_command_federation_channel(host, port)
            _return, _call = stub.unaryCall.with_call(_packet,
                                                      metadata=_routing_metadata,
                                                      timeout=(overall_timeout / 1000))
            audit_logger(job_id).info("grpc api response: {}".format(_return))
            channel.close()
            response = json_loads(_return.body.value)
            return response
        except Exception as e:
            exception = e
            schedule_logger(job_id).warning(f"remote request {endpoint} error, sleep and try again")
            time.sleep(2 * (t + 1))
    else:
        tips = 'Please check rollSite and fateflow network connectivity. '
        # if 'Error received from peer' in str(exception):
        #     tips = 'Please check if the fate flow server of the other party is started. '
        # if 'failed to connect to all addresses' in str(exception):
        #     tips = 'Please check whether the rollsite service (port: 9370) is started. '
        raise Exception('{}rpc request error: {}'.format(tips, exception))
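# Note on the retry loop above (a reading of the source): the back-off grows linearly,
# i.e. time.sleep(2 * (t + 1)) waits 2s, 4s, 6s, ... after each failed attempt
# (including the last one), after which the for-else raises with the last captured
# exception.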
def schedule_ready_job(cls, job):
    job_id, initiator_role, initiator_party_id = job.f_job_id, job.f_initiator_role, job.f_initiator_party_id
    update_status = cls.ready_signal(job_id=job_id, set_or_reset=False, ready_timeout_ttl=60 * 1000)
    schedule_logger(job_id).info(f"reset job {job_id} ready signal {update_status}")
def schedule_ready_jobs(cls, event):
    job_id, initiator_role, initiator_party_id = event.f_job_id, event.f_initiator_role, event.f_initiator_party_id
    update_status = JobQueue.update_event(job_id=job_id,
                                          initiator_role=initiator_role,
                                          initiator_party_id=initiator_party_id,
                                          job_status=JobStatus.WAITING,
                                          ttl=5 * 60 * 1000)
    schedule_logger(job_id).info(f"update job {job_id} ready status to waiting {update_status}")
def collect_task_of_all_party(cls, job, task):
    status, federated_response = FederatedScheduler.collect_task(job=job, task=task)
    if status != FederatedSchedulingStatusCode.SUCCESS:
        schedule_logger(job_id=job.f_job_id).warning(
            f"collect task {task.f_task_id} {task.f_task_version} on {task.f_role} {task.f_party_id} failed")
        return
    for _role in federated_response.keys():
        for _party_id, party_response in federated_response[_role].items():
            JobSaver.update_task_status(task_info=party_response["data"])
            JobSaver.update_task(task_info=party_response["data"])
def check_job_is_timeout(job: Job):
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    timeout = job_parameters.get("timeout", JOB_DEFAULT_TIMEOUT)
    now_time = current_timestamp()
    running_time = (now_time - job.f_create_time) / 1000
    if running_time > timeout:
        schedule_logger(job_id=job.f_job_id).info('job {} run time {}s timeout'.format(job.f_job_id, running_time))
        return True
    else:
        return False
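# Worked example of the timeout arithmetic above (values hypothetical): f_create_time
# and current_timestamp() are millisecond timestamps, so the division yields elapsed
# seconds, which is compared against a "timeout" job parameter expressed in seconds:
#
#   (1612000900000 - 1612000000000) / 1000 = 900.0  ->  timed out if timeout < 900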
def get_remaining_resource(cls, resource_model: typing.Union[EngineRegistry, Job], filters):
    remaining_cores, remaining_memory = None, None
    try:
        objs = resource_model.select(resource_model.f_remaining_cores,
                                     resource_model.f_remaining_memory).where(*filters)
        if objs:
            remaining_cores, remaining_memory = objs[0].f_remaining_cores, objs[0].f_remaining_memory
    except Exception as e:
        schedule_logger().exception(e)
    return remaining_cores, remaining_memory
def job_command(cls, job, command, command_body=None, dest_only_initiator=False, specific_dest=None,
                order_federated=False):
    federated_response = {}
    job_parameters = job.f_runtime_conf_on_party["job_parameters"]
    if dest_only_initiator:
        dest_partys = [(job.f_initiator_role, [job.f_initiator_party_id])]
        api_type = "initiator"
    elif specific_dest:
        dest_partys = specific_dest.items()
        api_type = "party"
    else:
        dest_partys = job.f_roles.items()
        api_type = "party"
    if order_federated:
        dest_partys = schedule_utils.federated_order_reset(
            dest_partys,
            scheduler_partys_info=[(job.f_initiator_role, job.f_initiator_party_id)])
    for dest_role, dest_party_ids in dest_partys:
        federated_response[dest_role] = {}
        for dest_party_id in dest_party_ids:
            try:
                response = federated_api(job_id=job.f_job_id,
                                         method='POST',
                                         endpoint='/{}/{}/{}/{}/{}'.format(
                                             api_type, job.f_job_id, dest_role, dest_party_id, command),
                                         src_party_id=job.f_initiator_party_id,
                                         dest_party_id=dest_party_id,
                                         src_role=job.f_initiator_role,
                                         json_body=command_body if command_body else {},
                                         federated_mode=job_parameters["federated_mode"])
                federated_response[dest_role][dest_party_id] = response
            except Exception as e:
                schedule_logger(job_id=job.f_job_id).exception(e)
                federated_response[dest_role][dest_party_id] = {
                    "retcode": RetCode.FEDERATED_ERROR,
                    "retmsg": "Federated schedule error, {}".format(e)
                }
            if federated_response[dest_role][dest_party_id]["retcode"]:
                schedule_logger(job_id=job.f_job_id).warning(
                    "an error occurred while sending the {} command to role {} party {}:\n{}".format(
                        command, dest_role, dest_party_id,
                        federated_response[dest_role][dest_party_id]["retmsg"]))
    return cls.return_federated_response(federated_response=federated_response)
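# Shape note (a reading of the source, values hypothetical): federated_response is keyed
# by role and then by party id, e.g.
#   {"guest": {9999: {"retcode": 0, ...}}, "host": {10000: {"retcode": 103, ...}}}
# and return_federated_response folds this per-party map into one scheduling status code.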
def federated_task_status(cls, job_id, task_id, task_version):
    tasks_on_all_party = JobSaver.query_task(task_id=task_id, task_version=task_version)
    tasks_party_status = [task.f_party_status for task in tasks_on_all_party]
    status = cls.calculate_multi_party_task_status(tasks_party_status)
    schedule_logger(job_id=job_id).info(
        "job {} task {} {} status is {}, calculated from task party status list: {}".format(
            job_id, task_id, task_version, status, tasks_party_status))
    return status
def save_metric_meta(self, metric_namespace: str, metric_name: str, metric_meta: MetricMeta,
                     job_level: bool = False):
    schedule_logger(self.job_id).info(
        'save job {} component {} on {} {} {} {} metric meta'.format(
            self.job_id, self.component_name, self.role, self.party_id, metric_namespace, metric_name))
    self.insert_metrics_into_db(metric_namespace, metric_name, 0, metric_meta.to_dict().items(), job_level)
def finish(cls, job, end_status):
    schedule_logger(job_id=job.f_job_id).info(
        "Job {} finished with {}, start to stop and clean the job".format(job.f_job_id, end_status))
    cls.stop_job(job_id=job.f_job_id,
                 role=job.f_initiator_role,
                 party_id=job.f_initiator_party_id,
                 stop_status=end_status)
    FederatedScheduler.clean_job(job=job)
    schedule_logger(job_id=job.f_job_id).info(
        "Job {} finished with {}, done".format(job.f_job_id, end_status))
def save_pipelined_model(cls, job_id, role, party_id):
    schedule_logger(job_id).info('job {} on {} {} start to save pipeline'.format(job_id, role, party_id))
    job_dsl, job_runtime_conf, runtime_conf_on_party, train_runtime_conf = job_utils.get_job_configuration(
        job_id=job_id, role=role, party_id=party_id)
    job_parameters = runtime_conf_on_party.get('job_parameters', {})
    if role in job_parameters.get("assistant_role", []):
        return
    model_id = job_parameters['model_id']
    model_version = job_parameters['model_version']
    job_type = job_parameters.get('job_type', '')
    work_mode = job_parameters['work_mode']
    roles = runtime_conf_on_party['role']
    initiator_role = runtime_conf_on_party['initiator']['role']
    initiator_party_id = runtime_conf_on_party['initiator']['party_id']
    if job_type == 'predict':
        return
    dag = schedule_utils.get_job_dsl_parser(dsl=job_dsl,
                                            runtime_conf=job_runtime_conf,
                                            train_runtime_conf=train_runtime_conf)
    predict_dsl = dag.get_predict_dsl(role=role)
    pipeline = pipeline_pb2.Pipeline()
    pipeline.inference_dsl = json_dumps(predict_dsl, byte=True)
    pipeline.train_dsl = json_dumps(job_dsl, byte=True)
    pipeline.train_runtime_conf = json_dumps(job_runtime_conf, byte=True)
    pipeline.fate_version = RuntimeConfig.get_env("FATE")
    pipeline.model_id = model_id
    pipeline.model_version = model_version
    pipeline.parent = True
    pipeline.loaded_times = 0
    pipeline.roles = json_dumps(roles, byte=True)
    pipeline.work_mode = work_mode
    pipeline.initiator_role = initiator_role
    pipeline.initiator_party_id = initiator_party_id
    pipeline.runtime_conf_on_party = json_dumps(runtime_conf_on_party, byte=True)
    pipeline.parent_info = json_dumps({}, byte=True)
    tracker = Tracker(job_id=job_id, role=role, party_id=party_id,
                      model_id=model_id, model_version=model_version)
    tracker.save_pipelined_model(pipelined_buffer_object=pipeline)
    if role != 'local':
        tracker.save_machine_learning_model_info()
    schedule_logger(job_id).info('job {} on {} {} save pipeline successfully'.format(job_id, role, party_id))
def report_task_update_to_driver(cls, task_info):
    """
    Report task update to FATEFlow Server
    :param task_info:
    :return:
    """
    schedule_logger().info("report task {} {} {} {} to driver".format(
        task_info["task_id"],
        task_info["task_version"],
        task_info["role"],
        task_info["party_id"],
    ))
    ControllerClient.report_task(task_info=task_info)
def get_table_meta(self, table_info):
    schedule_logger(self.job_id).info(f'start get table meta: {table_info}')
    table_meta_dict = storage.StorageTableMeta(namespace=table_info.get("namespace"),
                                               name=table_info.get("table_name"),
                                               create_address=False).to_dict()
    schedule_logger(self.job_id).info(f'get table meta success: {table_meta_dict}')
    table_meta_dict["part_of_data"] = serialize_b64(table_meta_dict["part_of_data"], to_str=True)
    table_meta_dict["schema"] = serialize_b64(table_meta_dict["schema"], to_str=True)
    return table_meta_dict
def save_metric_data(self, metric_namespace: str, metric_name: str, metrics: List[Metric], job_level=False):
    schedule_logger(self.job_id).info(
        'save job {} component {} on {} {} {} {} metric data'.format(
            self.job_id, self.component_name, self.role, self.party_id, metric_namespace, metric_name))
    kv = []
    for metric in metrics:
        kv.append((metric.key, metric.value))
    self.insert_metrics_into_db(metric_namespace, metric_name, 1, kv, job_level)
def bulk_insert_into_db(self, model, data_source):
    try:
        try:
            DB.create_tables([model])
        except Exception as e:
            schedule_logger(self.job_id).exception(e)
        batch_size = 50 if RuntimeConfig.USE_LOCAL_DATABASE else 1000
        for i in range(0, len(data_source), batch_size):
            with DB.atomic():
                model.insert_many(data_source[i:i + batch_size]).execute()
        return len(data_source)
    except Exception as e:
        schedule_logger(self.job_id).exception(e)
        return 0
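# Hedged usage sketch (field names are hypothetical): bulk_insert_into_db takes a peewee
# model class and a list of row dicts, inserting them in transactional batches of 50
# (local database) or 1000 rows; it returns the inserted row count, or 0 on failure.
#
#   rows = [{"f_job_id": job_id, "f_key": "k", "f_value": "v"}]
#   inserted = self.bulk_insert_into_db(model, rows)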
def stop_job():
    job_id = request.json.get('job_id')
    stop_status = request.json.get("stop_status", "canceled")
    jobs = JobSaver.query_job(job_id=job_id)
    if jobs:
        schedule_logger(job_id).info("stop job on this party")
        kill_status, kill_details = JobController.stop_jobs(job_id=job_id, stop_status=stop_status)
        schedule_logger(job_id).info(f"stop job on this party status {kill_status}")
        schedule_logger(job_id).info(f"request stop job {jobs[0]} to {stop_status}")
        status_code, response = FederatedScheduler.request_stop_job(job=jobs[0],
                                                                    stop_status=stop_status,
                                                                    command_body=jobs[0].to_json())
        if status_code == FederatedSchedulingStatusCode.SUCCESS:
            return get_json_result(retcode=RetCode.SUCCESS,
                                   retmsg=f"stop job on this party {kill_status};\n"
                                          f"stop job on all parties success")
        else:
            return get_json_result(retcode=RetCode.OPERATING_ERROR,
                                   retmsg="stop job on this party {};\n"
                                          "stop job failed:\n{}".format(kill_status,
                                                                        json_dumps(response, indent=4)))
    else:
        schedule_logger(job_id).info(f"cannot find job {job_id} to stop")
        return get_json_result(retcode=RetCode.DATA_ERROR, retmsg="cannot find job")
def update_task(cls, task_info):
    """
    Save to local database and then report to Initiator
    :param task_info:
    :return:
    """
    update_status = False
    try:
        update_status = JobSaver.update_task(task_info=task_info)
        cls.report_task_to_initiator(task_info=task_info)
    except Exception as e:
        schedule_logger(job_id=task_info["job_id"]).exception(e)
    return update_status