示例#1
0
    def __init__(
        self,
        artifact_type,
        path,
        accesskey_id=None,
        accesskey_secret=None,
        bucket=None,
        key=None,
        endpoint="",
        is_global=False,
    ):
        """Record an output artifact and, when possible, its credentials.

        ``artifact_type`` becomes ``self.type`` and is embedded in the
        generated ``self.id``.  ``path`` is kept as the local path, and
        ``bucket``, ``key``, ``endpoint`` and ``is_global`` are stored
        unchanged.  ``couler.create_secret`` is called only when both
        ``accesskey_id`` and ``accesskey_secret`` are truthy; otherwise
        ``self.secret`` is ``None``.
        """
        self.type = artifact_type
        self.id = f"output-{self.type}-{utils._get_uuid()}"
        self.path = path  # local path of the artifact
        self.is_global = is_global
        self.bucket = bucket
        self.key = key
        self.endpoint = endpoint

        # Without a complete key pair there is nothing to register.
        if not (accesskey_id and accesskey_secret):
            self.secret = None
            return

        # TODO: check this secret exist or not
        self.secret = couler.create_secret(
            {"accessKey": accesskey_id, "secretKey": accesskey_secret}
        )
示例#2
0
    def __init__(
        self,
        artifact_type,
        path,
        accesskey_id,
        accesskey_secret,
        bucket,
        key=None,
        endpoint="",
        is_global=False,
    ):
        """Record an output artifact whose storage credentials are required.

        ``artifact_type`` becomes ``self.type`` and is embedded in the
        generated ``self.id``; ``path`` is kept as the local path.  When
        ``key`` is ``None`` the local ``path`` is used as the key.  The
        access key pair is passed to ``couler.create_secret`` and the
        result is stored in ``self.secret``.

        Raises:
            SyntaxError: if ``accesskey_secret``, ``accesskey_id`` or
                ``bucket`` is ``None``.
        """
        self.type = artifact_type
        self.id = f"output-{self.type}-{utils._get_uuid()}"
        self.path = path  # local path of the artifact
        self.is_global = is_global

        mandatory = (accesskey_secret, accesskey_id, bucket)
        if any(value is None for value in mandatory):
            raise SyntaxError(
                f"need to input the correct config for {self.type}")

        self.bucket = bucket
        # With no explicit key, assume the remote path equals the local one.
        self.key = path if key is None else key
        self.endpoint = endpoint

        # TODO: check this secret exist or not
        self.secret = couler.create_secret(
            {"accessKey": accesskey_id, "secretKey": accesskey_secret}
        )
示例#3
0
    def test_create_secret(self):
        """Run two containers with different secrets and check both YAMLs.

        The first secret is built with ``couler.create_secret`` from
        literal data; the second comes from ``couler.obtain_secret`` with
        a list of keys, a namespace and a name.  Both must be present in
        ``couler.states._secrets`` with the expected metadata and data.
        """
        # Job 1: secret created from literal user data.
        credentials = {"uname": "abc", "passwd": "def"}
        created = couler.create_secret(secret_data=credentials, name="dummy1")
        couler.run_container(
            image="python:3.6", secret=created, command="echo $uname"
        )

        # Job 2: secret obtained by key names, namespace and name.
        key_names = ["access_key", "access_value"]
        obtained = couler.obtain_secret(
            secret_keys=key_names, namespace="test", name="dummy2"
        )
        couler.run_container(
            image="python:3.6", secret=obtained, command="echo $access_value"
        )

        # Both secrets must be registered exactly once.
        registry = couler.states._secrets
        self.assertEqual(len(registry), 2)
        created_yaml = registry[created].to_yaml()
        obtained_yaml = registry[obtained].to_yaml()

        self.assertEqual(created_yaml["metadata"]["name"], "dummy1")
        self.assertEqual(len(created_yaml["data"]), 2)
        for field, plain in (("uname", "abc"), ("passwd", "def")):
            self.assertEqual(
                created_yaml["data"][field], utils.encode_base64(plain)
            )

        self.assertEqual(obtained_yaml["metadata"]["namespace"], "test")
        self.assertEqual(obtained_yaml["metadata"]["name"], "dummy2")
        self.assertEqual(len(obtained_yaml["data"]), 2)
示例#4
0
    def __init__(
        self,
        path,
        accesskey_id,
        accesskey_secret,
        bucket,
        key=None,
        endpoint="http://oss-cn-hangzhou-zmf.aliyuncs.com",
        is_global=False,
    ):
        """Record an OSS output artifact and register its credentials.

        ``path`` is kept as the local path and ``self.type`` is fixed to
        ``"OSS"``.  When ``key`` is ``None`` the local ``path`` is used as
        the key.  The access key pair is passed to
        ``couler.create_secret`` and the result is stored in
        ``self.secret``.

        Raises:
            SyntaxError: if ``accesskey_secret``, ``accesskey_id`` or
                ``bucket`` is ``None``.
        """
        self.id = "output-oss-%s" % utils._get_uuid()
        self.path = path  # local path of the artifact
        self.type = "OSS"
        self.is_global = is_global

        mandatory = (accesskey_secret, accesskey_id, bucket)
        if any(value is None for value in mandatory):
            raise SyntaxError("need to input the correct config for oss")

        self.bucket = bucket
        # With no explicit key, assume the OSS path equals the local one.
        self.key = path if key is None else key
        self.endpoint = endpoint

        # Imported here, after validation, exactly as the original does.
        import couler.argo as couler

        # TODO: check this secret exist or not
        self.secret = couler.create_secret(
            {"accessKey": accesskey_id, "secretKey": accesskey_secret}
        )
示例#5
0
    def __init__(
        self,
        artifact_type,
        path,
        accesskey_id=None,
        accesskey_secret=None,
        bucket=None,
        key=None,
        endpoint="",
        is_global=False,
        insecure=False,
    ):
        """Record an output artifact and, when possible, its credentials.

        ``artifact_type`` becomes ``self.type`` and is embedded in the
        generated ``self.id``.  ``path`` is kept as the local path, and
        ``bucket``, ``key``, ``endpoint``, ``is_global`` and ``insecure``
        are stored unchanged.  ``couler.create_secret`` is called with
        ``artifact_secret=True`` only when both ``accesskey_id`` and
        ``accesskey_secret`` are truthy; otherwise ``self.secret`` is
        ``None``.
        """
        self.type = artifact_type
        self.id = f"output-{self.type}-{utils._get_uuid()}"
        self.path = path  # local path of the artifact
        self.is_global = is_global
        self.bucket = bucket
        self.key = key
        self.endpoint = endpoint
        self.insecure = insecure

        # Without a complete key pair there is nothing to register.
        if not (accesskey_id and accesskey_secret):
            self.secret = None
            return

        # artifact_secret flag causes the secret to be created only when a
        # secret with the same name doesn't exist in the namespace
        self.secret = couler.create_secret(
            {"accessKey": accesskey_id, "secretKey": accesskey_secret},
            artifact_secret=True,
        )
示例#6
0
    def test_tensorflow_train(self):
        """Check the workflow and TFJob manifest produced by ``tf.train``.

        A secret holding ``access_key`` is created and passed to
        ``tf.train``.  The test then verifies the secret's encoded data,
        the two generated templates (steps and train resource), and the
        Chief / PS / Worker / Evaluator replica specs parsed from the
        resource manifest.
        """
        access_key_secret = {"access_key": "key1234"}
        secret = couler.create_secret(secret_data=access_key_secret)

        tf.train(
            num_ps=2,
            num_workers=3,
            num_evaluators=1,
            image="tensorflow:1.13",
            command="python tf.py",
            no_chief=False,
            worker_resources="cpu=0.5,memory=1024",
            ps_restart_policy="Never",
            worker_restart_policy="OnFailure",
            evaluator_resources="cpu=2,memory=4096",
            clean_pod_policy="Running",
            secret=secret,
        )

        secret_yaml = list(couler.states._secrets.values())[0].to_yaml()
        self.assertEqual(secret_yaml["data"]["access_key"],
                         utils.encode_base64("key1234"))

        wf = couler.workflow_yaml()
        self.assertEqual(len(wf["spec"]["templates"]), 2)
        # Check steps template
        template0 = wf["spec"]["templates"][0]
        self.assertEqual(len(template0["steps"]), 1)
        self.assertEqual(len(template0["steps"][0]), 1)
        # Check train template
        template1 = wf["spec"]["templates"][1]
        self.assertEqual(template1["name"], "test-tensorflow-train")
        resource = template1["resource"]
        self.assertEqual(resource["action"], "create")
        self.assertEqual(resource["setOwnerReference"], "true")
        self.assertEqual(
            resource["successCondition"],
            "status.replicaStatuses.Worker.succeeded == 3",
        )
        self.assertEqual(
            resource["failureCondition"],
            "status.replicaStatuses.Worker.failed > 0",
        )
        # Check the tfjob spec
        tfjob = yaml.load(StringIO(resource["manifest"]),
                          Loader=yaml.FullLoader)
        self.assertEqual(tfjob["kind"], "TFJob")
        self.assertEqual(tfjob["spec"]["cleanPodPolicy"], "Running")

        chief = tfjob["spec"]["tfReplicaSpecs"]["Chief"]
        self.assertEqual(chief["replicas"], 1)
        chief_container = chief["template"]["spec"]["containers"][0]
        self.assertEqual(chief_container["env"][0]["name"], "access_key")
        self.assertEqual(
            chief_container["env"][0]["valueFrom"]["secretKeyRef"]["name"],
            secret_yaml["metadata"]["name"],
        )

        ps = tfjob["spec"]["tfReplicaSpecs"]["PS"]
        self.assertEqual(ps["replicas"], 2)
        self.assertEqual(ps["restartPolicy"], "Never")
        self.assertEqual(len(ps["template"]["spec"]["containers"]), 1)
        ps_container = ps["template"]["spec"]["containers"][0]
        self.assertEqual(ps_container["image"], "tensorflow:1.13")
        self.assertEqual(ps_container["command"], "python tf.py")

        worker = tfjob["spec"]["tfReplicaSpecs"]["Worker"]
        self.assertEqual(worker["replicas"], 3)
        self.assertEqual(worker["restartPolicy"], "OnFailure")
        self.assertEqual(len(worker["template"]["spec"]["containers"]), 1)
        # Read the Worker's own container: the image/command assertions
        # previously looked at the PS container again by mistake, leaving
        # the Worker container's image and command unchecked.
        worker_container = worker["template"]["spec"]["containers"][0]
        self.assertEqual(worker_container["image"], "tensorflow:1.13")
        self.assertEqual(worker_container["command"], "python tf.py")
        self.assertEqual(worker_container["env"][0]["name"], "access_key")
        self.assertEqual(
            worker_container["env"][0]["valueFrom"]["secretKeyRef"]["name"],
            secret_yaml["metadata"]["name"],
        )
        self.assertEqual(worker_container["resources"]["limits"]["cpu"], 0.5)
        self.assertEqual(worker_container["resources"]["limits"]["memory"],
                         1024)

        evaluator = tfjob["spec"]["tfReplicaSpecs"]["Evaluator"]
        self.assertEqual(evaluator["replicas"], 1)
        self.assertEqual(len(evaluator["template"]["spec"]["containers"]), 1)
        evaluator_container = evaluator["template"]["spec"]["containers"][0]
        self.assertEqual(evaluator_container["image"], "tensorflow:1.13")
        self.assertEqual(evaluator_container["resources"]["limits"]["cpu"], 2)
        self.assertEqual(evaluator_container["resources"]["limits"]["memory"],
                         4096)
示例#7
0
 def job_2():
     """Run a container with a secret created using ``dry_run=True``.

     The secret is built from sample ``uname`` / ``passwd`` data and
     handed to ``couler.run_container``.
     """
     sample_secret = couler.create_secret(
         secret_data={"uname": "abc", "passwd": "def"}, dry_run=True
     )
     couler.run_container(
         image="python:3.6", secret=sample_secret, command="echo $uname"
     )
示例#8
0
    def test_pytorch_train(self):
        """Check the workflow and PyTorchJob manifest from ``pytorch.train``.

        A secret holding ``access_key`` is created and passed to
        ``pytorch.train``.  The test then verifies the secret's encoded
        data, the two generated templates (steps and train resource), and
        the Master / Worker replica specs parsed from the resource
        manifest.
        """
        secret = couler.create_secret(secret_data={"access_key": "key1234"})

        pytorch.train(
            num_workers=3,
            image="pytorch:1.13",
            command="python pytorch.py",
            worker_resources="cpu=0.5,memory=1024",
            worker_restart_policy="OnFailure",
            clean_pod_policy="Running",
            secret=secret,
        )

        secret_yaml = list(couler.states._secrets.values())[0].to_yaml()
        self.assertEqual(
            secret_yaml["data"]["access_key"], utils.encode_base64("key1234")
        )

        workflow = couler.workflow_yaml()
        self.assertEqual(len(workflow["spec"]["templates"]), 2)

        # Steps template: a single step group containing a single step.
        steps_template = workflow["spec"]["templates"][0]
        self.assertEqual(len(steps_template["steps"]), 1)
        self.assertEqual(len(steps_template["steps"][0]), 1)

        # Train template: a resource that creates the job.
        train_template = workflow["spec"]["templates"][1]
        self.assertEqual(train_template["name"], "test-pytorch-train")
        resource = train_template["resource"]
        self.assertEqual(resource["action"], "create")
        self.assertEqual(resource["setOwnerReference"], "true")
        self.assertEqual(
            resource["successCondition"],
            "status.pytorchReplicaStatuses.Worker.succeeded > 0",
        )
        self.assertEqual(
            resource["failureCondition"],
            "status.pytorchReplicaStatuses.Worker.failed > 0",
        )

        # Parse the embedded manifest and inspect the PyTorchJob spec.
        manifest_stream = StringIO(resource["manifest"])
        job = yaml.load(manifest_stream, Loader=yaml.FullLoader)
        self.assertEqual(job["kind"], "PyTorchJob")
        self.assertEqual(job["spec"]["cleanPodPolicy"], "Running")

        master = job["spec"]["pytorchReplicaSpecs"]["Master"]
        self.assertEqual(master["replicas"], 1)
        master_env = master["template"]["spec"]["containers"][0]["env"][0]
        self.assertEqual(master_env["name"], "access_key")
        self.assertEqual(
            master_env["valueFrom"]["secretKeyRef"]["name"],
            secret_yaml["metadata"]["name"],
        )

        worker = job["spec"]["pytorchReplicaSpecs"]["Worker"]
        self.assertEqual(worker["replicas"], 3)
        self.assertEqual(worker["restartPolicy"], "OnFailure")
        worker_containers = worker["template"]["spec"]["containers"]
        self.assertEqual(len(worker_containers), 1)
        worker_container = worker_containers[0]
        self.assertEqual(worker_container["image"], "pytorch:1.13")
        self.assertEqual(worker_container["command"], "python pytorch.py")

        self.assertEqual(worker_container["env"][0]["name"], "access_key")
        self.assertEqual(
            worker_container["env"][0]["valueFrom"]["secretKeyRef"]["name"],
            secret_yaml["metadata"]["name"],
        )
        worker_limits = worker_container["resources"]["limits"]
        self.assertEqual(worker_limits["cpu"], 0.5)
        self.assertEqual(worker_limits["memory"], 1024)