def test_is_peer_job_inheritance_matched(self, mock_get_workflow):
    """Check ``is_peer_job_inheritance_matched`` on a forked workflow.

    The mocked peer workflow defines a non-federated ``raw-data-job`` and a
    federated ``train-job``; the local workflow defines only the federated
    ``train-job``.  With the peer flags fixed at ``[NEW, REUSE]``, the fork
    is expected to match when its own flag is ``REUSE`` and to mismatch when
    its own flag is ``NEW``.
    """
    # Peer side: returned by the mocked get-workflow call.
    peer_definition = WorkflowDefinition()
    peer_definition.job_definitions.extend([
        JobDefinition(name='raw-data-job'),
        JobDefinition(name='train-job', is_federated=True),
    ])
    mock_get_workflow.return_value = GetWorkflowResponse(
        config=peer_definition)

    # Local side: a single federated job shared by both workflows.
    local_definition = WorkflowDefinition(
        job_definitions=[JobDefinition(name='train-job', is_federated=True)])

    project = Project()
    project.set_config(
        project_pb2.Project(participants=[project_pb2.Participant()]))

    # The base workflow is persisted first so the fork can reference its id.
    base_workflow = Workflow(project=project)
    base_workflow.set_config(local_definition)
    db.session.add(base_workflow)
    db.session.commit()
    db.session.flush()

    forked_workflow = Workflow(project=project,
                               forked_from=base_workflow.id)
    forked_workflow.set_config(local_definition)
    forked_workflow.set_create_job_flags([CreateJobFlag.REUSE])
    forked_workflow.set_peer_create_job_flags(
        [CreateJobFlag.NEW, CreateJobFlag.REUSE])
    self.assertTrue(is_peer_job_inheritance_matched(forked_workflow))

    # Flipping the local flag to NEW must break the match.
    forked_workflow.set_create_job_flags([CreateJobFlag.NEW])
    self.assertFalse(is_peer_job_inheritance_matched(forked_workflow))
def make_workflow_template():
    """Build the ``psi_join_tree_model`` workflow definition.

    The workflow is declared with ``is_left=False`` and chains three jobs
    through ``JobDependency``:

    * ``raw-data-job`` (``RAW_DATA``, not federated),
    * ``data-join-job`` (``PSI_DATA_JOIN``, federated, depends on
      ``raw-data-job``),
    * ``train-job`` (``TREE_MODEL_TRAINING``, federated, depends on
      ``data-join-job``).

    Each job carries a JSON ``yaml_template`` containing ``${...}``
    placeholders.  Job-level variables are always referenced as
    ``${workflow.jobs.<job>.variables.<name>}`` and workflow-level variables
    as ``${workflow.variables.<name>}``.

    Returns:
        WorkflowDefinition: the fully populated template.
    """
    workflow = WorkflowDefinition(
        group_alias='psi_join_tree_model',
        is_left=False,
        variables=[
            Variable(name='image_version',
                     value='v1.5-rc3',
                     access_mode=Variable.PEER_READABLE),
            Variable(name='num_partitions',
                     value='2',
                     access_mode=Variable.PEER_WRITABLE),
        ],
        job_definitions=[
            JobDefinition(
                name='raw-data-job',
                job_type=JobDefinition.RAW_DATA,
                is_federated=False,
                variables=[
                    Variable(
                        name='input_dir',
                        value='/app/deploy/integrated_test/tfrecord_raw_data',
                        access_mode=Variable.PRIVATE),
                    Variable(name='file_wildcard',
                             value='*.rd',
                             access_mode=Variable.PRIVATE),
                    Variable(name='batch_size',
                             value='1024',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='input_format',
                             value='TF_RECORD',
                             access_mode=Variable.PRIVATE),
                    Variable(name='worker_cpu',
                             value='2000m',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='worker_mem',
                             value='4Gi',
                             access_mode=Variable.PEER_WRITABLE),
                ],
                # Master: one unpaired replica; Worker: one unpaired replica
                # per partition.
                yaml_template='''{
  "apiVersion": "fedlearner.k8s.io/v1alpha1",
  "kind": "FLApp",
  "metadata": {
    "name": "${workflow.jobs.raw-data-job.name}",
    "namespace": "${project.variables.namespace}"
  },
  "spec": {
    "cleanPodPolicy": "All",
    "flReplicaSpecs": {
      "Master": {
        "template": {
          "spec": {
            "containers": [
              {
                "resources": {
                  "limits": { "cpu": "1000m", "memory": "2Gi" },
                  "requests": { "cpu": "1000m", "memory": "2Gi" }
                },
                "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "ports": [ { "containerPort": 50051, "name": "flapp-port" } ],
                "command": [ "/app/deploy/scripts/data_portal/run_data_portal_master.sh" ],
                "args": [],
                "env": [
                  ${system.basic_envs},
                  { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" },
                  { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" },
                  { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" },
                  { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.raw-data-job.name}" },
                  { "name": "DATA_PORTAL_NAME", "value": "${workflow.jobs.raw-data-job.name}" },
                  { "name": "OUTPUT_PARTITION_NUM", "value": "${workflow.variables.num_partitions}" },
                  { "name": "INPUT_BASE_DIR", "value": "${workflow.jobs.raw-data-job.variables.input_dir}" },
                  { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/raw_data/${workflow.jobs.raw-data-job.name}" },
                  { "name": "RAW_DATA_PUBLISH_DIR", "value": "portal_publish_dir/${workflow.jobs.raw-data-job.name}" },
                  { "name": "DATA_PORTAL_TYPE", "value": "PSI" },
                  { "name": "FILE_WILDCARD", "value": "${workflow.jobs.raw-data-job.variables.file_wildcard}" }
                ],
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ],
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow"
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "volumes": [
              { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" }
            ],
            "restartPolicy": "Never"
          }
        },
        "pair": false,
        "replicas": 1
      },
      "Worker": {
        "replicas": ${workflow.variables.num_partitions},
        "template": {
          "spec": {
            "containers": [
              {
                "resources": {
                  "limits": {
                    "cpu": "${workflow.jobs.raw-data-job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.raw-data-job.variables.worker_mem}"
                  },
                  "requests": {
                    "cpu": "${workflow.jobs.raw-data-job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.raw-data-job.variables.worker_mem}"
                  }
                },
                "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "command": [ "/app/deploy/scripts/data_portal/run_data_portal_worker.sh" ],
                "args": [],
                "env": [
                  ${system.basic_envs},
                  { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" },
                  { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" },
                  { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" },
                  { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.raw-data-job.name}" },
                  { "name": "BATCH_SIZE", "value": "${workflow.jobs.raw-data-job.variables.batch_size}" },
                  { "name": "INPUT_DATA_FORMAT", "value": "${workflow.jobs.raw-data-job.variables.input_format}" },
                  { "name": "COMPRESSED_TYPE", "value": "" },
                  { "name": "OUTPUT_DATA_FORMAT", "value": "TF_RECORD" }
                ],
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ],
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow"
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "volumes": [
              { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" }
            ],
            "restartPolicy": "Never"
          }
        },
        "pair": false
      }
    }
  }
}
'''),
            JobDefinition(
                name='data-join-job',
                job_type=JobDefinition.PSI_DATA_JOIN,
                is_federated=True,
                variables=[
                    Variable(name='worker_cpu',
                             value='4000m',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='worker_mem',
                             value='4Gi',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='rsa_private_key_path',
                             value='',
                             access_mode=Variable.PRIVATE),
                ],
                dependencies=[JobDependency(source='raw-data-job')],
                # Master: one paired replica; Worker: one paired replica per
                # partition.  The RSA key placeholders go through
                # ``.variables.`` like every other job variable.
                yaml_template='''
{
  "apiVersion": "fedlearner.k8s.io/v1alpha1",
  "kind": "FLApp",
  "metadata": {
    "name": "${workflow.jobs.data-join-job.name}",
    "namespace": "${project.variables.namespace}"
  },
  "spec": {
    "role": "Leader",
    "cleanPodPolicy": "All",
    "peerSpecs": {
      "Follower": {
        "peerURL": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80",
        "authority": "${project.participants[0].egress_domain}",
        "extraHeaders": { "x-host": "default.fedlearner.operator" }
      }
    },
    "flReplicaSpecs": {
      "Master": {
        "template": {
          "spec": {
            "restartPolicy": "Never",
            "containers": [
              {
                "env": [
                  ${system.basic_envs},
                  { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" },
                  { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" },
                  { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.data-join-job.name}" },
                  { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" },
                  { "name": "ROLE", "value": "leader" },
                  { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/data_source/${workflow.jobs.data-join-job.name}" },
                  { "name": "PARTITION_NUM", "value": "${workflow.variables.num_partitions}" },
                  { "name": "START_TIME", "value": "0" },
                  { "name": "END_TIME", "value": "999999999999" },
                  { "name": "NEGATIVE_SAMPLING_RATE", "value": "1.0" },
                  { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.raw-data-job.name}" }
                ],
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow",
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ],
                "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "ports": [ { "containerPort": 50051, "name": "flapp-port" } ],
                "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ],
                "args": [ "/app/deploy/scripts/rsa_psi/run_psi_data_join_master.sh" ],
                "resources": {
                  "limits": { "cpu": "2000m", "memory": "3Gi" },
                  "requests": { "cpu": "2000m", "memory": "3Gi" }
                }
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "volumes": [
              { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" }
            ]
          }
        },
        "pair": true,
        "replicas": 1
      },
      "Worker": {
        "template": {
          "spec": {
            "restartPolicy": "Never",
            "containers": [
              {
                "env": [
                  ${system.basic_envs},
                  { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" },
                  { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" },
                  { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" },
                  { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" },
                  { "name": "ROLE", "value": "follower" },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.data-join-job.name}" },
                  { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/data_source/${workflow.jobs.data-join-job.name}" },
                  { "name": "RSA_KEY_PATH", "value": "${workflow.jobs.data-join-job.variables.rsa_private_key_path}" },
                  { "name": "RSA_PRIVATE_KEY_PATH", "value": "${workflow.jobs.data-join-job.variables.rsa_private_key_path}" },
                  { "name": "PSI_RAW_DATA_ITER", "value": "TF_RECORD" },
                  { "name": "PSI_OUTPUT_BUILDER", "value": "TF_RECORD" },
                  { "name": "DATA_BLOCK_BUILDER", "value": "TF_RECORD" },
                  { "name": "DATA_BLOCK_DUMP_INTERVAL", "value": "600" },
                  { "name": "DATA_BLOCK_DUMP_THRESHOLD", "value": "524288" },
                  { "name": "EXAMPLE_ID_DUMP_INTERVAL", "value": "600" },
                  { "name": "EXAMPLE_ID_DUMP_THRESHOLD", "value": "524288" },
                  { "name": "EXAMPLE_JOINER", "value": "SORT_RUN_JOINER" },
                  { "name": "SIGN_RPC_TIMEOUT_MS", "value": "128000" },
                  { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.raw-data-job.name}" },
                  { "name": "PARTITION_NUM", "value": "${workflow.variables.num_partitions}" }
                ],
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow",
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ],
                "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "ports": [ { "containerPort": 50051, "name": "flapp-port" } ],
                "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ],
                "args": [ "/app/deploy/scripts/rsa_psi/run_psi_data_join_worker.sh" ],
                "resources": {
                  "limits": {
                    "cpu": "${workflow.jobs.data-join-job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.data-join-job.variables.worker_mem}"
                  },
                  "requests": {
                    "cpu": "${workflow.jobs.data-join-job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.data-join-job.variables.worker_mem}"
                  }
                }
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "volumes": [
              { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" }
            ]
          }
        },
        "pair": true,
        "replicas": ${workflow.variables.num_partitions}
      }
    }
  }
}
'''),
            JobDefinition(
                name='train-job',
                job_type=JobDefinition.TREE_MODEL_TRAINING,
                is_federated=True,
                variables=[
                    Variable(name='worker_cpu',
                             value='4000m',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='worker_mem',
                             value='8Gi',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='send_scores_to_follower',
                             value='True',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='send_metrics_to_follower',
                             value='True',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='num_parallel',
                             value='4',
                             access_mode=Variable.PEER_WRITABLE),
                ],
                dependencies=[JobDependency(source='data-join-job')],
                # Single paired Worker replica; reads the data source
                # produced by data-join-job.
                yaml_template='''
{
  "apiVersion": "fedlearner.k8s.io/v1alpha1",
  "kind": "FLApp",
  "metadata": {
    "name": "${workflow.jobs.train-job.name}",
    "namespace": "${project.variables.namespace}"
  },
  "spec": {
    "role": "Leader",
    "cleanPodPolicy": "All",
    "peerSpecs": {
      "Leader": {
        "peerURL": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80",
        "authority": "${project.participants[0].egress_domain}",
        "extraHeaders": { "x-host": "default.fedlearner.operator" }
      }
    },
    "flReplicaSpecs": {
      "Worker": {
        "template": {
          "spec": {
            "restartPolicy": "Never",
            "containers": [
              {
                "env": [
                  ${system.basic_envs},
                  { "name": "EGRESS_URL", "value": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80" },
                  { "name": "EGRESS_HOST", "value": "${project.participants[0].egress_host}" },
                  { "name": "EGRESS_DOMAIN", "value": "${project.participants[0].egress_domain}" },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.train-job.name}" },
                  { "name": "STORAGE_ROOT_PATH", "value": "${project.variables.storage_root_dir}" },
                  { "name": "ROLE", "value": "leader" },
                  { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/job_output/${workflow.jobs.train-job.name}" },
                  { "name": "MODE", "value": "train" },
                  { "name": "SEND_SCORES_TO_FOLLOWER", "value": "${workflow.jobs.train-job.variables.send_scores_to_follower}" },
                  { "name": "SEND_METRICS_TO_FOLLOWER", "value": "${workflow.jobs.train-job.variables.send_metrics_to_follower}" },
                  { "name": "NUM_PARALLEL", "value": "${workflow.jobs.train-job.variables.num_parallel}" },
                  { "name": "DATA_SOURCE", "value": "${workflow.jobs.data-join-job.name}" }
                ],
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow",
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ],
                "image": "artifact.bytedance.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "ports": [ { "containerPort": 50051, "name": "flapp-port" } ],
                "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ],
                "args": [ "/app/deploy/scripts/trainer/run_tree_worker.sh" ],
                "resources": {
                  "limits": {
                    "cpu": "${workflow.jobs.train-job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.train-job.variables.worker_mem}"
                  },
                  "requests": {
                    "cpu": "${workflow.jobs.train-job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.train-job.variables.worker_mem}"
                  }
                }
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "volumes": [
              { "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" }, "name": "data" }
            ]
          }
        },
        "pair": true,
        "replicas": 1
      }
    }
  }
}
'''),
        ])

    return workflow
def get_config(self):
    """Deserialize the stored job config.

    Returns:
        A ``JobDefinition`` parsed from ``self.config``, or ``None`` when
        no config has been stored.
    """
    serialized = self.config
    if serialized is None:
        return None

    definition = JobDefinition()
    definition.ParseFromString(serialized)
    return definition
def test_generate_self_dict(self):
    """Check the dict ``generate_self_dict`` produces for an unsaved job.

    The job is given a config holding four plain name/value variables; the
    expected dict contains the job's own columns, the config rendered with
    its default field values, and a flat ``variables`` name -> value map.
    """
    name_value_pairs = [
        ('namespace', 'leader'),
        ('basic_envs', '{}'),
        ('storage_root_dir', '/'),
        ('EGRESS_URL', '127.0.0.1:1991'),
    ]
    config = {
        'variables': [{'name': name, 'value': value}
                      for name, value in name_value_pairs]
    }

    job = Job(name='aa', project_id=1, workflow_id=1, state=JobState.NEW)
    job.set_config(ParseDict(config, JobDefinition()))

    # Every variable is rendered with the same default extra fields.
    expected_config_variables = [
        {
            'name': name,
            'value': value,
            'access_mode': 'UNSPECIFIED',
            'widget_schema': '',
            'value_type': 'STRING',
        }
        for name, value in name_value_pairs
    ]
    expected_config = {
        'expert_mode': False,
        'variables': expected_config_variables,
        'name': '',
        'job_type': 'UNSPECIFIED',
        'is_federated': False,
        'dependencies': [],
        'yaml_template': '',
    }
    expected = {
        'id': None,
        'name': 'aa',
        'job_type': None,
        'state': 'NEW',
        'config': expected_config,
        'is_disabled': None,
        'workflow_id': 1,
        'project_id': 1,
        'flapp_snapshot': None,
        'pods_snapshot': None,
        'error_message': None,
        'created_at': None,
        'updated_at': None,
        'deleted_at': None,
        'pods': [],
        'complete_at': None,
        'variables': dict(name_value_pairs),
    }

    self.assertEqual(generate_self_dict(job), expected)
def make_workflow_template():
    """Build the ``test_template`` workflow definition.

    The workflow is declared with ``is_left=True`` and contains two jobs:

    * ``raw_data_job`` (``RAW_DATA``, not federated),
    * ``data_join_job`` (``DATA_JOIN``, federated, depends on
      ``raw_data_job``).

    Each job carries a JSON ``yaml_template`` containing ``${...}``
    placeholders.  ``num_partitions`` is a workflow-level variable, so it is
    referenced as ``${workflow.variables.num_partitions}`` in both jobs.
    The data-join job reads its input from the directory the raw-data job
    publishes to (``portal_publish_dir/<raw_data_job name>``).

    Returns:
        WorkflowDefinition: the fully populated template.
    """
    workflow = WorkflowDefinition(
        group_alias='test_template',
        is_left=True,
        variables=[
            Variable(name='image_version',
                     value='v1.5-rc3',
                     access_mode=Variable.PEER_READABLE),
            Variable(name='num_partitions',
                     value='4',
                     access_mode=Variable.PEER_WRITABLE),
        ],
        job_definitions=[
            JobDefinition(
                name='raw_data_job',
                job_type=JobDefinition.RAW_DATA,
                is_federated=False,
                is_manual=False,
                variables=[
                    Variable(
                        name='input_dir',
                        value='/app/deploy/integrated_test/tfrecord_raw_data',
                        access_mode=Variable.PRIVATE),
                    Variable(name='file_wildcard',
                             value='*.rd',
                             access_mode=Variable.PRIVATE),
                    Variable(name='batch_size',
                             value='1024',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='input_format',
                             value='TF_RECORD',
                             access_mode=Variable.PRIVATE),
                    Variable(name='output_format',
                             value='TF_RECORD',
                             access_mode=Variable.PRIVATE),
                    Variable(name='master_cpu',
                             value='2',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='master_mem',
                             value='3Gi',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='worker_cpu',
                             value='2',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='worker_mem',
                             value='3Gi',
                             access_mode=Variable.PEER_WRITABLE),
                ],
                # Master: one unpaired replica; Worker: one unpaired replica
                # per partition.
                yaml_template='''{
  "apiVersion": "fedlearner.k8s.io/v1alpha1",
  "kind": "FLApp",
  "metadata": {
    "name": "${workflow.jobs.raw_data_job.name}",
    "namespace": "${project.variables.namespace}"
  },
  "spec": {
    "cleanPodPolicy": "All",
    "flReplicaSpecs": {
      "Master": {
        "pair": false,
        "replicas": 1,
        "template": {
          "spec": {
            "containers": [
              {
                "command": [ "/app/deploy/scripts/data_portal/run_data_portal_master.sh" ],
                "env": [
                  { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } },
                  { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } },
                  ${system.basic_envs},
                  ${project.variables.basic_envs},
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.raw_data_job.name}" },
                  { "name": "DATA_PORTAL_NAME", "value": "${workflow.jobs.raw_data_job.name}" },
                  { "name": "OUTPUT_PARTITION_NUM", "value": "${workflow.variables.num_partitions}" },
                  { "name": "INPUT_BASE_DIR", "value": "${workflow.jobs.raw_data_job.variables.input_dir}" },
                  { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/raw_data/${workflow.jobs.raw_data_job.name}" },
                  { "name": "RAW_DATA_PUBLISH_DIR", "value": "portal_publish_dir/${workflow.jobs.raw_data_job.name}" },
                  { "name": "DATA_PORTAL_TYPE", "value": "Streaming" },
                  { "name": "FILE_WILDCARD", "value": "${workflow.jobs.raw_data_job.variables.file_wildcard}" }
                ],
                "image": "hub.docker.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow",
                "ports": [ { "containerPort": 50051, "name": "flapp-port" } ],
                "resources": {
                  "limits": {
                    "cpu": "${workflow.jobs.raw_data_job.variables.master_cpu}",
                    "memory": "${workflow.jobs.raw_data_job.variables.master_mem}"
                  },
                  "requests": {
                    "cpu": "${workflow.jobs.raw_data_job.variables.master_cpu}",
                    "memory": "${workflow.jobs.raw_data_job.variables.master_mem}"
                  }
                },
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ]
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "restartPolicy": "Never",
            "volumes": [
              { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } }
            ]
          }
        }
      },
      "Worker": {
        "pair": false,
        "replicas": ${workflow.variables.num_partitions},
        "template": {
          "metadata": { "creationTimestamp": null },
          "spec": {
            "containers": [
              {
                "command": [ "/app/deploy/scripts/data_portal/run_data_portal_worker.sh" ],
                "env": [
                  { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } },
                  { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } },
                  ${system.basic_envs},
                  ${project.variables.basic_envs},
                  { "name": "CPU_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.cpu" } } },
                  { "name": "MEM_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.memory" } } },
                  { "name": "CPU_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.cpu" } } },
                  { "name": "MEM_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.memory" } } },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.raw_data_job.name}" },
                  { "name": "BATCH_SIZE", "value": "${workflow.jobs.raw_data_job.variables.batch_size}" },
                  { "name": "INPUT_DATA_FORMAT", "value": "${workflow.jobs.raw_data_job.variables.input_format}" },
                  { "name": "COMPRESSED_TYPE" },
                  { "name": "OUTPUT_DATA_FORMAT", "value": "${workflow.jobs.raw_data_job.variables.output_format}" }
                ],
                "image": "hub.docker.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow",
                "resources": {
                  "limits": {
                    "cpu": "${workflow.jobs.raw_data_job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.raw_data_job.variables.worker_mem}"
                  },
                  "requests": {
                    "cpu": "${workflow.jobs.raw_data_job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.raw_data_job.variables.worker_mem}"
                  }
                },
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ]
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "restartPolicy": "Never",
            "volumes": [
              { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } }
            ]
          }
        }
      }
    },
    "peerSpecs": { "Leader": { "peerURL": "" } },
    "role": "Follower"
  }
}
'''),
            JobDefinition(
                name='data_join_job',
                job_type=JobDefinition.DATA_JOIN,
                is_federated=True,
                is_manual=False,
                variables=[
                    Variable(name='master_cpu',
                             value='2',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='master_mem',
                             value='3Gi',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='worker_cpu',
                             value='2',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='worker_mem',
                             value='3Gi',
                             access_mode=Variable.PEER_WRITABLE),
                    Variable(name='role',
                             value='Follower',
                             access_mode=Variable.PEER_WRITABLE),
                ],
                dependencies=[JobDependency(source='raw_data_job')],
                # Master: one paired replica sized by master_cpu/master_mem;
                # Worker: one paired replica per partition sized by
                # worker_cpu/worker_mem.  Each env name appears once.
                yaml_template='''
{
  "apiVersion": "fedlearner.k8s.io/v1alpha1",
  "kind": "FLApp",
  "metadata": {
    "name": "${workflow.jobs.data_join_job.name}",
    "namespace": "${project.variables.namespace}"
  },
  "spec": {
    "cleanPodPolicy": "All",
    "flReplicaSpecs": {
      "Master": {
        "pair": true,
        "replicas": 1,
        "template": {
          "metadata": { "creationTimestamp": null },
          "spec": {
            "containers": [
              {
                "args": [ "/app/deploy/scripts/data_join/run_data_join_master.sh" ],
                "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ],
                "env": [
                  { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } },
                  { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } },
                  ${system.basic_envs},
                  ${project.variables.basic_envs},
                  { "name": "ROLE", "value": "${workflow.jobs.data_join_job.variables.role}" },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.data_join_job.name}" },
                  { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/data_source/${workflow.jobs.data_join_job.name}" },
                  { "name": "CPU_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.cpu" } } },
                  { "name": "MEM_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.memory" } } },
                  { "name": "CPU_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.cpu" } } },
                  { "name": "MEM_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.memory" } } },
                  { "name": "BATCH_MODE", "value": "--batch_mode" },
                  { "name": "PARTITION_NUM", "value": "${workflow.variables.num_partitions}" },
                  { "name": "START_TIME", "value": "0" },
                  { "name": "END_TIME", "value": "999999999999" },
                  { "name": "NEGATIVE_SAMPLING_RATE", "value": "1.0" },
                  { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.raw_data_job.name}" }
                ],
                "image": "hub.docker.com/fedlearner/fedlearner:${workflow.variables.image_version}",
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow",
                "ports": [ { "containerPort": 50051, "name": "flapp-port" } ],
                "resources": {
                  "limits": {
                    "cpu": "${workflow.jobs.data_join_job.variables.master_cpu}",
                    "memory": "${workflow.jobs.data_join_job.variables.master_mem}"
                  },
                  "requests": {
                    "cpu": "${workflow.jobs.data_join_job.variables.master_cpu}",
                    "memory": "${workflow.jobs.data_join_job.variables.master_mem}"
                  }
                },
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ]
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "restartPolicy": "Never",
            "volumes": [
              { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } }
            ]
          }
        }
      },
      "Worker": {
        "pair": true,
        "replicas": ${workflow.variables.num_partitions},
        "template": {
          "metadata": { "creationTimestamp": null },
          "spec": {
            "containers": [
              {
                "args": [ "/app/deploy/scripts/data_join/run_data_join_worker.sh" ],
                "command": [ "/app/deploy/scripts/wait4pair_wrapper.sh" ],
                "env": [
                  { "name": "POD_IP", "valueFrom": { "fieldRef": { "fieldPath": "status.podIP" } } },
                  { "name": "POD_NAME", "valueFrom": { "fieldRef": { "fieldPath": "metadata.name" } } },
                  ${system.basic_envs},
                  ${project.variables.basic_envs},
                  { "name": "ROLE", "value": "${workflow.jobs.data_join_job.variables.role}" },
                  { "name": "APPLICATION_ID", "value": "${workflow.jobs.data_join_job.name}" },
                  { "name": "OUTPUT_BASE_DIR", "value": "${project.variables.storage_root_dir}/data_source/${workflow.jobs.data_join_job.name}" },
                  { "name": "CPU_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.cpu" } } },
                  { "name": "MEM_REQUEST", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "requests.memory" } } },
                  { "name": "CPU_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.cpu" } } },
                  { "name": "MEM_LIMIT", "valueFrom": { "resourceFieldRef": { "divisor": "0", "resource": "limits.memory" } } },
                  { "name": "DATA_BLOCK_DUMP_INTERVAL", "value": "600" },
                  { "name": "DATA_BLOCK_DUMP_THRESHOLD", "value": "65536" },
                  { "name": "EXAMPLE_ID_DUMP_INTERVAL", "value": "600" },
                  { "name": "EXAMPLE_ID_DUMP_THRESHOLD", "value": "65536" },
                  { "name": "EXAMPLE_ID_BATCH_SIZE", "value": "4096" },
                  { "name": "MAX_FLYING_EXAMPLE_ID", "value": "307152" },
                  { "name": "MIN_MATCHING_WINDOW", "value": "2048" },
                  { "name": "MAX_MATCHING_WINDOW", "value": "8192" },
                  { "name": "RAW_DATA_ITER", "value": "${workflow.jobs.raw_data_job.variables.output_format}" },
                  { "name": "RAW_DATA_SUB_DIR", "value": "portal_publish_dir/${workflow.jobs.raw_data_job.name}" },
                  { "name": "PARTITION_NUM", "value": "${workflow.variables.num_partitions}" }
                ],
                "image": "artifact.bytedance.com/fedlearner/fedlearner:5b499dd",
                "imagePullPolicy": "IfNotPresent",
                "name": "tensorflow",
                "ports": [ { "containerPort": 50051, "name": "flapp-port" } ],
                "resources": {
                  "limits": {
                    "cpu": "${workflow.jobs.data_join_job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.data_join_job.variables.worker_mem}"
                  },
                  "requests": {
                    "cpu": "${workflow.jobs.data_join_job.variables.worker_cpu}",
                    "memory": "${workflow.jobs.data_join_job.variables.worker_mem}"
                  }
                },
                "volumeMounts": [ { "mountPath": "/data", "name": "data" } ]
              }
            ],
            "imagePullSecrets": [ { "name": "regcred" } ],
            "restartPolicy": "Never",
            "volumes": [
              { "name": "data", "persistentVolumeClaim": { "claimName": "pvc-fedlearner-default" } }
            ]
          }
        }
      }
    },
    "peerSpecs": {
      "Follower": {
        "authority": "external.name",
        "extraHeaders": { "x-host": "leader.flapp.operator" },
        "peerURL": "fedlearner-stack-ingress-nginx-controller.default.svc.cluster.local:80"
      }
    },
    "role": "Leader"
  }
}
'''),
        ])

    return workflow