def _schedule_job(self, job_id):
    """Try to launch a waiting job once its upstream jobs are complete.

    The job is submitted to Kubernetes only when it is in the WAITING state
    and every job it depends on reports completion. When the submission
    succeeds, ``job.start()`` is called and the DB session is committed.

    Args:
        job_id: Primary key of the job to schedule.

    Returns:
        The job's state after the scheduling attempt.

    Raises:
        AssertionError: If the job or one of its source jobs does not exist.
    """
    job = Job.query.get(job_id)
    assert job is not None, 'Job %d not found' % job_id
    if job.state != JobState.WAITING:
        return job.state

    dependencies = JobDependency.query.filter(
        JobDependency.dst_job_id == job.id).all()
    for dependency in dependencies:
        upstream = Job.query.get(dependency.src_job_id)
        assert upstream is not None, \
            'Job %d not found' % dependency.src_job_id
        if not upstream.is_complete():
            # An upstream job has not completed yet; keep this one waiting.
            return job.state

    client = get_client()
    run_yaml = generate_job_run_yaml(job)
    try:
        client.create_or_replace_custom_object(CrdKind.FLAPP, run_yaml)
    except RuntimeError as err:
        # Submission failed: report it and leave the job state untouched.
        logging.error('Start job %d has Runtime error msg: %s', job_id,
                      err.args)
    else:
        job.start()
        db.session.commit()
    return job.state
def get(self):
    """Return the image of the first container of the web console deployment.

    Returns:
        A dict of the form ``{'data': {'webconsole_image': <image>}}``.
    """
    deployment = get_client().get_deployment('fedlearner-web-console-v2')
    image = deployment.spec.template.spec.containers[0].image
    return {'data': {'webconsole_image': image}}
def get(self, job_id, pod_name):
    """Open a webshell session on the 'tensorflow' container of a job's pod.

    Args:
        job_id: Primary key of the job that owns the pod.
        pod_name: Name of the pod to attach to.

    Returns:
        A dict ``{'data': {'id': <session id>, 'base': <base url>}}``.

    Raises:
        NotFoundException: If no job with ``job_id`` exists.
    """
    # Look the job up by its primary key. The Job model declares no `job`
    # attribute, so filtering on `job=` cannot match a column.
    job = Job.query.filter_by(id=job_id).first()
    if job is None:
        raise NotFoundException()
    k8s = get_client()
    base = k8s.get_base_url()
    container_id = k8s.get_webshell_session(job.project.get_namespace(),
                                            pod_name, 'tensorflow')
    return {'data': {'id': container_id, 'base': base}}
def _create_add_on(participant, certificate, custom_host=None):
    """Validate a certificate bundle and create the participant's add-on.

    Does nothing when ``certificate`` is None.

    Args:
        participant: Object providing ``domain_name`` and ``url``.
        certificate: Object whose ``certs`` mapping holds the certificate
            files, or None.
        custom_host: Optional value forwarded unchanged to ``create_add_on``.

    Raises:
        InvalidArgumentException: If a file listed in
            ``_CERTIFICATE_FILE_NAMES`` is missing from ``certificate.certs``
            or a RuntimeError is raised while creating the add-on.
    """
    if certificate is None:
        return

    # Every expected certificate file must be present before calling k8s.
    for required_name in _CERTIFICATE_FILE_NAMES:
        if certificate.certs.get(required_name) is not None:
            continue
        reason = '{} not existed'.format(required_name)
        raise InvalidArgumentException(
            details=ErrorMessage.PARAM_FORMAT_ERROR.value.format(
                'certificates', reason))

    try:
        create_add_on(get_client(), participant.domain_name,
                      participant.url, certificate.certs, custom_host)
    except RuntimeError as err:
        raise InvalidArgumentException(details=str(err))
def _schedule_job(self, job_id):
    """Render a waiting job's YAML template and submit it to Kubernetes.

    The job is handled only when it is in the WAITING state and all jobs it
    depends on are complete. Its YAML template is formatted with workflow,
    project and system variables, parsed as JSON and submitted as a FLAPP
    custom object. On success ``job.start()`` is called and the DB session
    is committed.

    Args:
        job_id: Primary key of the job to schedule.

    Returns:
        The job's state after the scheduling attempt.

    Raises:
        AssertionError: If the job or one of its source jobs does not exist.
    """
    job = Job.query.get(job_id)
    assert job is not None, 'Job %d not found' % job_id
    if job.state != JobState.WAITING:
        return job.state

    dependencies = JobDependency.query.filter(
        JobDependency.dst_job_id == job.id).all()
    for dependency in dependencies:
        upstream = Job.query.get(dependency.src_job_id)
        assert upstream is not None, \
            'Job %d not found' % dependency.src_job_id
        if not upstream.is_complete():
            return job.state

    client = get_client()

    # System-level variables; BASIC_ENVS falls back to a JSON placeholder.
    default_basic_envs = ('{"name": "SYSTEM_BASIC_ENVS_DEFAULT",'
                          '"value": ""}')
    system_vars = {
        'basic_envs': os.environ.get('BASIC_ENVS', default_basic_envs)
    }

    # Workflow-level variables, including every job of the workflow by name.
    workflow_vars = job.workflow.to_dict()
    workflow_vars['variables'] = self._make_variables_dict(
        job.workflow.get_config().variables)
    jobs_by_name = {}
    workflow_vars['jobs'] = jobs_by_name
    for sibling in job.workflow.get_jobs():
        sibling_variables = self._make_variables_dict(
            sibling.get_config().variables)
        sibling_dict = sibling.to_dict()
        sibling_dict['variables'] = sibling_variables
        jobs_by_name[sibling.get_config().name] = sibling_dict

    # Project-level variables.
    project_vars = job.project.to_dict()
    project_vars['variables'] = self._make_variables_dict(
        job.project.get_config().variables)

    rendered = format_yaml(job.yaml_template,
                           workflow=workflow_vars,
                           project=project_vars,
                           system=system_vars)
    flapp_spec = json.loads(rendered)
    try:
        client.create_or_replace_custom_object(CrdKind.FLAPP, flapp_spec)
    except RuntimeError as err:
        # Submission failed: report it and leave the job state untouched.
        logging.error('Start job %d has Runtime error msg: %s', job_id,
                      err.args)
    else:
        job.start()
        db.session.commit()
    return job.state
def patch(self):
    """Update the web console deployment's image when one is supplied.

    Reads the optional ``webconsole_image`` request argument. When it is
    truthy, the image of the deployment's first container is replaced and
    the deployment is written back through the k8s client.

    Returns:
        ``{'data': {}}`` in every case.
    """
    parser = reqparse.RequestParser()
    parser.add_argument('webconsole_image',
                        type=str,
                        required=False,
                        default=None,
                        help='image for webconsole')
    new_image = parser.parse_args()['webconsole_image']
    if not new_image:
        # Nothing to change.
        return {'data': {}}

    deployment_name = 'fedlearner-web-console-v2'
    client = get_client()
    deployment = client.get_deployment(deployment_name)
    deployment.spec.template.spec.containers[0].image = new_image
    client.create_or_update_deployment(deployment.metadata, deployment.spec,
                                       deployment_name)
    return {'data': {}}
def post(self):
    """Create a new project from the request's name, config and comment.

    Validates the request, parses and checks participant certificates,
    creates a k8s add-on for every participant that supplied certificates,
    then stores the project.

    Returns:
        ``{'data': <project dict>}`` for the newly stored project.

    Raises:
        InvalidArgumentException: If the name is already used, the
            participants are missing/malformed or more than one is given,
            a certificate file is missing, add-on creation raises
            RuntimeError, the config cannot be parsed, or the DB write
            fails.
    """
    parser = reqparse.RequestParser()
    parser.add_argument('name',
                        required=True,
                        type=str,
                        help=ErrorMessage.PARAM_FORMAT_ERROR.value.format(
                            'name', 'Empty'))
    parser.add_argument('config',
                        required=True,
                        type=dict,
                        help=ErrorMessage.PARAM_FORMAT_ERROR.value.format(
                            'config', 'Empty'))
    parser.add_argument('comment')
    data = parser.parse_args()
    name = data['name']
    config = data['config']
    comment = data['comment']

    if Project.query.filter_by(name=name).first() is not None:
        raise InvalidArgumentException(
            details=ErrorMessage.NAME_CONFLICT.value.format(name))

    participants = config.get('participants')
    if participants is None:
        raise InvalidArgumentException(
            details=ErrorMessage.PARAM_FORMAT_ERROR.value.format(
                'participants', 'Empty'))
    if len(participants) != 1:
        # TODO: remove limit after operator supports multiple participants
        raise InvalidArgumentException(
            details='Currently not support multiple participants.')

    certificates = {}
    # URL of every participant that supplied certificates, keyed by domain
    # name, so the add-on step below does not rely on leftover loop
    # variables.
    participant_urls = {}
    for participant in participants:
        if any(field not in participant
               for field in ('name', 'url', 'domain_name')):
            raise InvalidArgumentException(
                details=ErrorMessage.PARAM_FORMAT_ERROR.value.format(
                    'participants', 'Participant must have name, '
                    'domain_name and url.'))
        domain_name = participant.get('domain_name')
        if participant.get('certificates') is not None:
            current_cert = parse_certificates(
                participant.get('certificates'))
            # check validation
            for file_name in _CERTIFICATE_FILE_NAMES:
                if current_cert.get(file_name) is None:
                    raise InvalidArgumentException(
                        details=ErrorMessage.PARAM_FORMAT_ERROR.value.
                        format('certificates', '{} not existed'.format(
                            file_name)))
            certificates[domain_name] = {'certs': current_cert}
            participant_urls[domain_name] = participant.get('url')
            participant.pop('certificates')

    # create add on: one per domain, each with its own URL and certificates
    if certificates:
        try:
            k8s_client = get_client()
            for domain_name, certificate in certificates.items():
                create_add_on(k8s_client, domain_name,
                              participant_urls[domain_name],
                              certificate['certs'])
        except RuntimeError as e:
            raise InvalidArgumentException(details=str(e))

    new_project = Project()
    # generate token
    # If the config carries a `token` key, its value is used; otherwise a
    # new one is generated by uuid.
    config['name'] = name
    token = config.get('token', uuid4().hex)
    config['token'] = token

    # check format of config
    try:
        new_project.set_config(ParseDict(config, ProjectProto()))
    except Exception as e:
        raise InvalidArgumentException(
            details=ErrorMessage.PARAM_FORMAT_ERROR.value.format(
                'config', e))
    new_project.set_certificate(
        ParseDict({'domain_name_to_cert': certificates},
                  CertificateStorage()))
    new_project.name = name
    new_project.token = token
    new_project.comment = comment

    try:
        new_project = db.session.merge(new_project)
        db.session.commit()
    except Exception as e:
        raise InvalidArgumentException(details=str(e))

    return {'data': new_project.to_dict()}
class Job(db.Model):
    """Database model of a job that runs as a FLAPP custom object.

    While the job is STARTED, FLAPP and pod information is read live through
    the k8s client. When the job is stopped, both are snapshotted as JSON
    text so they can still be served afterwards.
    """
    __tablename__ = 'job_v2'
    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    name = db.Column(db.String(255), unique=True)
    job_type = db.Column(db.Enum(JobType, native_enum=False), nullable=False)
    state = db.Column(db.Enum(JobState, native_enum=False),
                      nullable=False,
                      default=JobState.INVALID)
    yaml_template = db.Column(db.Text())
    # Serialized JobDefinition proto (see get_config).
    config = db.Column(db.LargeBinary())

    workflow_id = db.Column(db.Integer,
                            db.ForeignKey('workflow_v2.id'),
                            nullable=False,
                            index=True)
    project_id = db.Column(db.Integer,
                           db.ForeignKey(Project.id),
                           nullable=False)
    # JSON dumps taken by stop(); cleared again by schedule().
    flapp_snapshot = db.Column(db.Text())
    pods_snapshot = db.Column(db.Text())

    created_at = db.Column(db.DateTime(timezone=True),
                           server_default=func.now())
    updated_at = db.Column(db.DateTime(timezone=True),
                           server_default=func.now(),
                           onupdate=func.now())
    deleted_at = db.Column(db.DateTime(timezone=True))

    project = db.relationship(Project)
    workflow = db.relationship('Workflow')
    _k8s_client = get_client()

    def get_config(self):
        """Return the stored config parsed as a JobDefinition, or None."""
        if self.config is not None:
            proto = JobDefinition()
            proto.ParseFromString(self.config)
            return proto
        return None

    def _set_snapshot_flapp(self):
        """Store the current FLAPP custom object as a JSON snapshot."""
        flapp = self._k8s_client.get_custom_object(
            CrdKind.FLAPP, self.name, self.project.get_namespace())
        self.flapp_snapshot = json.dumps(flapp)

    def _set_snapshot_pods(self):
        """Store the current pod listing of the FLAPP as a JSON snapshot."""
        pods = self._k8s_client.list_resource_of_custom_object(
            CrdKind.FLAPP, self.name, 'pods', self.project.get_namespace())
        self.pods_snapshot = json.dumps(pods)

    def get_pods(self):
        """Return the ``'pods'`` entry of the pod listing, live or snapshot.

        Returns:
            The live value while STARTED (None if the k8s call raises
            RuntimeError), otherwise the snapshot value, or None when no
            snapshot exists.
        """
        if self.state == JobState.STARTED:
            try:
                pods = self._k8s_client.list_resource_of_custom_object(
                    CrdKind.FLAPP, self.name, 'pods',
                    self.project.get_namespace())
                return pods['pods']
            except RuntimeError as e:
                logging.error('Get %d pods error msg: %s', self.id, e.args)
                return None
        if self.pods_snapshot is not None:
            return json.loads(self.pods_snapshot)['pods']
        return None

    def get_flapp(self):
        """Return the ``'flapp'`` entry of the FLAPP object, live or snapshot.

        Returns:
            The live value while STARTED (None if the k8s call raises
            RuntimeError), otherwise the snapshot value, or None when no
            snapshot exists.
        """
        if self.state == JobState.STARTED:
            try:
                flapp = self._k8s_client.get_custom_object(
                    CrdKind.FLAPP, self.name, self.project.get_namespace())
                return flapp['flapp']
            except RuntimeError as e:
                logging.error('Get %d flapp error msg: %s', self.id, str(e))
                return None
        if self.flapp_snapshot is not None:
            return json.loads(self.flapp_snapshot)['flapp']
        return None

    def get_pods_for_frontend(self):
        """Build the pod list shown by the frontend.

        Pods recorded as failed/succeeded in the FLAPP replica status are
        merged with the pods from the pod listing; entries with the same
        name are deduplicated, the pod-listing entry winning.

        Returns:
            A list of dicts describing pods; empty when no FLAPP is
            available.
        """
        result = []
        flapp = self.get_flapp()
        if flapp is None:
            return result
        if 'status' in flapp \
                and 'flReplicaStatus' in flapp['status']:
            replicas = flapp['status']['flReplicaStatus']
            # A missing replica status only skips this section; the pod
            # listing below must still be reported.
            if replicas:
                for pod_type in replicas:
                    for state in ['failed', 'succeeded']:
                        for pod in replicas[pod_type][state]:
                            result.append({
                                'name': pod,
                                'status': 'Flapp_{}'.format(state),
                                'pod_type': pod_type
                            })
        # msg from pods
        pods = self.get_pods()
        if pods is None:
            return result
        pods = pods['items']
        for pod in pods:
            # TODO: make this more readable for frontend
            pod_for_front = {
                'name': pod['metadata']['name'],
                'pod_type': pod['metadata']['labels']['fl-replica-type'],
                'status': pod['status']['phase'],
                'conditions': pod['status']['conditions']
            }
            if 'containerStatuses' in pod['status']:
                pod_for_front['containers_status'] = \
                    pod['status']['containerStatuses']
            result.append(pod_for_front)
        # deduplication pods both in pods and flapp
        result = list({pod['name']: pod for pod in result}.values())
        return result

    def get_state_for_frontend(self):
        """Map the stored state to the state name shown by the frontend.

        Returns:
            'COMPLETED', 'FAILED' or 'RUNNING' while STARTED; 'NEW' when
            STOPPED without any FLAPP information; otherwise the name of
            the stored state.
        """
        if self.state == JobState.STARTED:
            if self.is_complete():
                return 'COMPLETED'
            if self.is_failed():
                return 'FAILED'
            return 'RUNNING'
        if self.state == JobState.STOPPED:
            if self.get_flapp() is None:
                return 'NEW'
        return self.state.name

    def is_failed(self):
        """Return True if the FLAPP app state is failed or shut down."""
        flapp = self.get_flapp()
        if flapp is None \
                or 'status' not in flapp \
                or 'appState' not in flapp['status']:
            return False
        return flapp['status']['appState'] in [
            'FLStateFailed', 'FLStateShutDown'
        ]

    def is_complete(self):
        """Return True if the FLAPP app state is 'FLStateComplete'."""
        flapp = self.get_flapp()
        if flapp is None \
                or 'status' not in flapp \
                or 'appState' not in flapp['status']:
            return False
        return flapp['status']['appState'] == 'FLStateComplete'

    def get_complete_at(self):
        """Return the FLAPP status' ``complete_at`` value, or None."""
        flapp = self.get_flapp()
        if flapp is None \
                or 'status' not in flapp \
                or 'complete_at' not in flapp['status']:
            return None
        return flapp['status']['complete_at']

    def stop(self):
        """Snapshot and delete the FLAPP if started, then mark STOPPED."""
        if self.state == JobState.STARTED:
            # Snapshots must be taken before the custom object is deleted.
            self._set_snapshot_flapp()
            self._set_snapshot_pods()
            self._k8s_client.delete_custom_object(
                CrdKind.FLAPP, self.name, self.project.get_namespace())
        self.state = JobState.STOPPED

    def schedule(self):
        """Clear both snapshots and move a STOPPED job to WAITING."""
        assert self.state == JobState.STOPPED
        self.pods_snapshot = None
        self.flapp_snapshot = None
        self.state = JobState.WAITING

    def start(self):
        """Mark the job as STARTED."""
        self.state = JobState.STARTED

    def set_yaml_template(self, yaml_template):
        """Replace the stored YAML template."""
        self.yaml_template = yaml_template
class Job(db.Model):
    """Database model of a job that runs as a FLAPP custom object.

    While the job is STARTED, FLAPP and pod information is read live through
    the k8s client. When the job is stopped, both are snapshotted as JSON
    text so they can still be served afterwards.
    """
    __tablename__ = 'job_v2'
    __table_args__ = (Index('idx_workflow_id', 'workflow_id'), {
        'comment': 'webconsole job',
        'mysql_engine': 'innodb',
        'mysql_charset': 'utf8mb4',
    })
    id = db.Column(db.Integer,
                   primary_key=True,
                   autoincrement=True,
                   comment='id')
    name = db.Column(db.String(255), unique=True, comment='name')
    job_type = db.Column(db.Enum(JobType, native_enum=False),
                         nullable=False,
                         comment='job type')
    state = db.Column(db.Enum(JobState, native_enum=False),
                      nullable=False,
                      default=JobState.INVALID,
                      comment='state')
    yaml_template = db.Column(db.Text(), comment='yaml_template')
    config = db.Column(db.LargeBinary(), comment='config')

    is_disabled = db.Column(db.Boolean(), default=False, comment='is_disabled')

    workflow_id = db.Column(db.Integer, nullable=False, comment='workflow id')
    project_id = db.Column(db.Integer, nullable=False, comment='project id')
    flapp_snapshot = db.Column(db.Text(), comment='flapp snapshot')
    pods_snapshot = db.Column(db.Text(), comment='pods snapshot')

    created_at = db.Column(db.DateTime(timezone=True),
                           server_default=func.now(),
                           comment='created at')
    updated_at = db.Column(db.DateTime(timezone=True),
                           server_default=func.now(),
                           onupdate=func.now(),
                           comment='updated at')
    deleted_at = db.Column(db.DateTime(timezone=True), comment='deleted at')

    project = db.relationship('Project',
                              primaryjoin='Project.id == '
                              'foreign(Job.project_id)')
    workflow = db.relationship('Workflow',
                               primaryjoin='Workflow.id == '
                               'foreign(Job.workflow_id)')
    _k8s_client = get_client()

    def get_config(self):
        """Parse the stored config bytes into a JobDefinition, if any."""
        if self.config is None:
            return None
        definition = JobDefinition()
        definition.ParseFromString(self.config)
        return definition

    def _set_snapshot_flapp(self):
        """Dump the current FLAPP custom object into ``flapp_snapshot``."""
        namespace = self.project.get_namespace()
        current = self._k8s_client.get_custom_object(CrdKind.FLAPP, self.name,
                                                     namespace)
        self.flapp_snapshot = json.dumps(current)

    def _set_snapshot_pods(self):
        """Dump the current pod listing of the FLAPP into ``pods_snapshot``."""
        namespace = self.project.get_namespace()
        current = self._k8s_client.list_resource_of_custom_object(
            CrdKind.FLAPP, self.name, 'pods', namespace)
        self.pods_snapshot = json.dumps(current)

    def get_pods(self):
        """Return the ``'pods'`` entry of the pod listing.

        Read live while STARTED (None when the k8s call raises
        RuntimeError); otherwise taken from the snapshot, or None when no
        snapshot exists.
        """
        if self.state != JobState.STARTED:
            snapshot = self.pods_snapshot
            return None if snapshot is None else json.loads(snapshot)['pods']
        try:
            listing = self._k8s_client.list_resource_of_custom_object(
                CrdKind.FLAPP, self.name, 'pods',
                self.project.get_namespace())
            return listing['pods']
        except RuntimeError as err:
            logging.error('Get %d pods error msg: %s', self.id, err.args)
            return None

    def get_flapp(self):
        """Return the ``'flapp'`` entry of the FLAPP custom object.

        Read live while STARTED (None when the k8s call raises
        RuntimeError); otherwise taken from the snapshot, or None when no
        snapshot exists.
        """
        if self.state != JobState.STARTED:
            snapshot = self.flapp_snapshot
            return None if snapshot is None else json.loads(snapshot)['flapp']
        try:
            live = self._k8s_client.get_custom_object(
                CrdKind.FLAPP, self.name, self.project.get_namespace())
            return live['flapp']
        except RuntimeError as err:
            logging.error('Get %d flapp error msg: %s', self.id, str(err))
            return None

    def get_pods_for_frontend(self, filter_private_info=False):
        """Build the pod list shown by the frontend.

        Args:
            filter_private_info: When true, pod messages are built from the
                ``reason`` fields; otherwise from the ``message`` fields.

        Returns:
            A list of dicts with ``name``, ``pod_type``, ``status`` and
            ``message``; entries sharing a name are deduplicated, the
            pod-listing entry winning. Empty when no FLAPP is available.
        """
        collected = []
        flapp = self.get_flapp()
        if flapp is None:
            return collected

        if 'status' in flapp and 'flReplicaStatus' in flapp['status']:
            replicas = flapp['status']['flReplicaStatus']
            if replicas:
                collected.extend({
                    'name': pod_name,
                    'pod_type': replica_type,
                    'status': 'Flapp_{}'.format(outcome),
                    'message': '',
                } for replica_type in replicas
                                 for outcome in ('failed', 'succeeded')
                                 for pod_name in replicas[replica_type][outcome])

        # msg from pods
        pod_listing = self.get_pods()
        if pod_listing is None:
            return collected

        # Which field of a container state / condition feeds the message.
        wanted = 'reason' if filter_private_info else 'message'
        for pod in pod_listing['items']:
            phase = pod['status']['phase'].lower()
            notes = []
            if 'containerStatuses' in pod['status']:
                first_state = pod['status']['containerStatuses'][0]['state']
                for state_name, detail in first_state.items():
                    if wanted in detail:
                        notes.append(state_name + ':' + detail[wanted])
            for condition in pod['status']['conditions']:
                if wanted in condition:
                    notes.append(condition['type'] + ':' + condition[wanted])
            collected.append({
                'name': pod['metadata']['name'],
                'pod_type': pod['metadata']['labels']['fl-replica-type'],
                'status': phase,
                'message': ', '.join(notes)
            })

        # deduplication pods both in pods and flapp
        by_name = {}
        for entry in collected:
            by_name[entry['name']] = entry
        return list(by_name.values())

    def get_state_for_frontend(self):
        """Translate the stored state into the name shown by the frontend."""
        if self.state == JobState.STARTED:
            if self.is_complete():
                return 'COMPLETED'
            return 'FAILED' if self.is_failed() else 'RUNNING'
        if self.state == JobState.STOPPED and self.get_flapp() is None:
            return 'NEW'
        return self.state.name

    def _flapp_status_value(self, key):
        """Return ``flapp['status'][key]``, or None when it is unavailable."""
        flapp = self.get_flapp()
        if flapp is None or 'status' not in flapp:
            return None
        status = flapp['status']
        if key not in status:
            return None
        return status[key]

    def is_failed(self):
        """Tell whether the FLAPP app state is failed or shut down."""
        return self._flapp_status_value('appState') in [
            'FLStateFailed', 'FLStateShutDown'
        ]

    def is_complete(self):
        """Tell whether the FLAPP app state is 'FLStateComplete'."""
        return self._flapp_status_value('appState') == 'FLStateComplete'

    def get_complete_at(self):
        """Return the FLAPP status' ``complete_at`` value, or None."""
        return self._flapp_status_value('complete_at')

    def stop(self):
        """Snapshot and delete the FLAPP if started, then mark STOPPED."""
        if self.state == JobState.STARTED:
            # Take both snapshots before the custom object disappears.
            self._set_snapshot_flapp()
            self._set_snapshot_pods()
            namespace = self.project.get_namespace()
            self._k8s_client.delete_custom_object(CrdKind.FLAPP, self.name,
                                                  namespace)
        self.state = JobState.STOPPED

    def schedule(self):
        """Drop both snapshots and move a STOPPED job to WAITING."""
        assert self.state == JobState.STOPPED
        self.pods_snapshot = None
        self.flapp_snapshot = None
        self.state = JobState.WAITING

    def start(self):
        """Mark the job as STARTED."""
        self.state = JobState.STARTED

    def set_yaml_template(self, yaml_template):
        """Replace the stored YAML template."""
        self.yaml_template = yaml_template
class Job(db.Model):
    """Database model of a job that runs as a FLAPP custom object.

    While the job is STARTED, FLAPP and pod information is read live through
    the k8s client. When the job is stopped, both are snapshotted as JSON
    text so they can still be served afterwards.
    """
    __tablename__ = 'job_v2'
    id = db.Column(db.Integer, primary_key=True, autoincrement=True)
    name = db.Column(db.String(255), unique=True)
    job_type = db.Column(db.Enum(JobType), nullable=False)
    state = db.Column(db.Enum(JobState),
                      nullable=False,
                      default=JobState.INVALID)
    yaml_template = db.Column(db.Text(), nullable=False)
    config = db.Column(db.Text(), nullable=False)
    workflow_id = db.Column(db.Integer,
                            db.ForeignKey('workflow_v2.id'),
                            nullable=False,
                            index=True)
    project_id = db.Column(db.Integer,
                           db.ForeignKey(Project.id),
                           nullable=False)
    # JSON dumps taken by stop(); cleared again by schedule().
    flapp_snapshot = db.Column(db.Text())
    pods_snapshot = db.Column(db.Text())
    created_at = db.Column(db.DateTime(timezone=True),
                           server_default=func.now())
    updated_at = db.Column(db.DateTime(timezone=True),
                           server_default=func.now(),
                           onupdate=func.now())
    deleted_at = db.Column(db.DateTime(timezone=True))

    project = db.relationship(Project)
    workflow = db.relationship('Workflow')
    _k8s_client = get_client()

    def get_config(self):
        """Return the stored config parsed as a JobDefinition, or None."""
        if self.config is not None:
            proto = JobDefinition()
            proto.ParseFromString(self.config)
            return proto
        return None

    def _set_snapshot_flapp(self):
        """Store the current FLAPP custom object as a JSON snapshot."""
        flapp = self._k8s_client.get_custom_object(
            CrdKind.FLAPP, self.name, self.project.get_namespace())
        self.flapp_snapshot = json.dumps(flapp)

    def _set_snapshot_pods(self):
        """Store the current pod listing of the FLAPP as a JSON snapshot."""
        pods = self._k8s_client.list_resource_of_custom_object(
            CrdKind.FLAPP, self.name, 'pods', self.project.get_namespace())
        self.pods_snapshot = json.dumps(pods)

    def get_flapp(self):
        """Return the FLAPP custom object, live or from the snapshot.

        Returns:
            The live object while STARTED, otherwise the parsed snapshot,
            or None when no snapshot exists.
        """
        if self.state == JobState.STARTED:
            # Fetch the FLAPP object itself, with the same call that
            # _set_snapshot_flapp uses, not the pod listing.
            return self._k8s_client.get_custom_object(
                CrdKind.FLAPP, self.name, self.project.get_namespace())
        if self.flapp_snapshot is not None:
            return json.loads(self.flapp_snapshot)
        return None

    def get_pods(self):
        """Return the pod listing of the FLAPP, live or from the snapshot.

        Returns:
            The live listing while STARTED, otherwise the parsed snapshot,
            or None when no snapshot exists.
        """
        if self.state == JobState.STARTED:
            return self._k8s_client.list_resource_of_custom_object(
                CrdKind.FLAPP, self.name, 'pods',
                self.project.get_namespace())
        if self.pods_snapshot is not None:
            return json.loads(self.pods_snapshot)
        return None

    def get_pods_for_front(self):
        """List the pods recorded in the FLAPP replica status.

        Returns:
            A list of ``{'name', 'state', 'pod_type'}`` dicts; empty when
            the FLAPP or its replica status is unavailable.
        """
        result = []
        flapp = self.get_flapp()
        if flapp is not None \
                and 'status' in flapp \
                and 'flReplicaStatus' in flapp['status']:
            # The replica status may be present but unset.
            replicas = flapp['status']['flReplicaStatus'] or {}
            for pod_type in replicas:
                for state in replicas[pod_type]:
                    for pod in replicas[pod_type][state]:
                        result.append({
                            'name': pod,
                            'state': state,
                            'pod_type': pod_type
                        })
        return result

    def get_state_for_front(self):
        """Map the stored state to the state name shown by the frontend.

        Returns:
            'COMPLETE', 'FAILED' or 'RUNNING' while STARTED; 'NEW' when
            STOPPED without any FLAPP information; otherwise the name of
            the stored state.
        """
        if self.state == JobState.STARTED:
            if self.is_complete():
                return 'COMPLETE'
            if self.is_failed():
                return 'FAILED'
            return 'RUNNING'
        if self.state == JobState.STOPPED:
            if self.get_flapp() is None:
                return 'NEW'
        return self.state.name

    def is_failed(self):
        """Return True if the FLAPP app state is failed or shut down."""
        flapp = self.get_flapp()
        if flapp is None \
                or 'status' not in flapp \
                or 'appState' not in flapp['status']:
            return False
        return flapp['status']['appState'] in [
            'FLStateFailed', 'FLStateShutDown'
        ]

    def is_complete(self):
        """Return True if the FLAPP app state is 'FLStateComplete'."""
        flapp = self.get_flapp()
        if flapp is None \
                or 'status' not in flapp \
                or 'appState' not in flapp['status']:
            return False
        return flapp['status']['appState'] == 'FLStateComplete'

    def get_complete_at(self):
        """Return the FLAPP status' ``complete_at`` value, or None."""
        flapp = self.get_flapp()
        if flapp is None \
                or 'status' not in flapp \
                or 'complete_at' not in flapp['status']:
            return None
        return flapp['status']['complete_at']

    def stop(self):
        """Snapshot and delete the FLAPP if started, then mark STOPPED."""
        if self.state == JobState.STARTED:
            # Snapshots must be taken before the custom object is deleted.
            self._set_snapshot_flapp()
            self._set_snapshot_pods()
            self._k8s_client.delete_custom_object(
                CrdKind.FLAPP, self.name, self.project.get_namespace())
        self.state = JobState.STOPPED

    def schedule(self):
        """Clear both snapshots and move a STOPPED job to WAITING."""
        assert self.state == JobState.STOPPED
        self.pods_snapshot = None
        self.flapp_snapshot = None
        self.state = JobState.WAITING

    def start(self):
        """Mark the job as STARTED."""
        self.state = JobState.STARTED

    def set_yaml_template(self, yaml_template):
        """Replace the stored YAML template."""
        self.yaml_template = yaml_template
def get(self, job_id, pod_name):
    """Open a webshell session on the 'tensorflow' container of a pod.

    Args:
        job_id: Value handed to ``ProjectK8sAdapter`` to resolve the
            namespace.
        pod_name: Name of the pod to attach to.

    Returns:
        A dict ``{'data': {'id': <session id>, 'base': <base url>}}``.
    """
    client = get_client()
    base_url = client.get_base_url()
    namespace = ProjectK8sAdapter(job_id).get_namespace()
    session_id = client.get_webshell_session(namespace, pod_name,
                                             'tensorflow')
    return {'data': {'id': session_id, 'base': base_url}}
class Job(db.Model): __tablename__ = 'job_v2' id = db.Column(db.Integer, primary_key=True, autoincrement=True) name = db.Column(db.String(255), unique=True) job_type = db.Column(db.String(16), nullable=False) status = db.Column(db.Enum(JobStatus), nullable=False) yaml = db.Column(db.Text(), nullable=False) workflow_id = db.Column(db.Integer, db.ForeignKey(Workflow.id), nullable=False, index=True) project_id = db.Column(db.Integer, db.ForeignKey(Project.id), nullable=False) flapp_snapshot = db.Column(db.Text()) pods_snapshot = db.Column(db.Text()) created_at = db.Column(db.DateTime(timezone=True), server_default=func.now()) updated_at = db.Column(db.DateTime(timezone=True), server_default=func.now(), server_onupdate=func.now()) deleted_at = db.Column(db.DateTime(timezone=True)) _project_adapter = ProjectK8sAdapter(project_id) _k8s_client = get_client() def _set_snapshot_flapp(self): flapp = json.dumps(self._k8s_client.get_flapp(self. _project_adapter.get_namespace(), self.name)) self.flapp_snapshot = json.dumps(flapp) def _set_snapshot_pods(self): flapp = json.dumps(self._k8s_client.get_pods(self. _project_adapter.get_namespace(), self.name)) self.flapp_snapshot = json.dumps(flapp) def get_flapp(self): if self.status == JobStatus.STARTED: self._set_snapshot_flapp() return json.loads(self.flapp_snapshot) def get_pods(self): if self.status == JobStatus.STARTED: self._set_snapshot_pods() return json.loads(self.pods_snapshot) def run(self): if self.status == JobStatus.STARTED: raise ResourceConflictException('Job has been started') self.status = JobStatus.STARTED self._k8s_client.create_flapp(self._project_adapter. get_namespace(), self.yaml) def stop(self): if self.status == JobStatus.STOPPED: raise ResourceConflictException('Job has stopped') self.status = JobStatus.STOPPED self._set_snapshot_flapp() self._set_snapshot_pods() self._k8s_client.delete_flapp(self._project_adapter. 
get_namespace(), self.name) def set_yaml(self, yaml_template, job_config): yaml = merge(yaml_template, self._project_adapter.get_global_job_spec())