示例#1
0
def test_secrets():
    """Exercise the secret lifecycle: create, inspect, list, delete."""
    pach = python_pachyderm.Client()
    name = util.test_repo_name("test-secrets")

    pach.create_secret(name, {"mykey": "my-value"})

    # The new secret is visible both directly and via the listing.
    assert pach.inspect_secret(name).secret.name == name
    listed = pach.list_secret()
    assert len(listed) == 1
    assert listed[0].secret.name == name

    pach.delete_secret(name)

    # After deletion, inspection fails and the listing is empty.
    with pytest.raises(python_pachyderm.RpcError):
        pach.inspect_secret(name)
    assert len(pach.list_secret()) == 0
示例#2
0
def test_spout_commit():
    """A spout pipeline should produce exactly one finished user commit."""
    pach = python_pachyderm.Client()
    pach.delete_all()

    pach.create_pipeline(
        pipeline_name="pipeline-spout-commit",
        transform=pps_proto.Transform(
            cmd=["bash"],
            stdin=[
                "echo 'commit time' >> file.txt",
                "pachctl put file pipeline-spout-commit@master:/file.txt -f file.txt",
            ],
        ),
        spout=pps_proto.Spout(),
    )

    # Block until the spout's first commit reaches the FINISHED state.
    subscription = pach.subscribe_commit(
        repo_name="pipeline-spout-commit",
        branch="master",
        state=pfs_proto.FINISHED,
        origin_kind=pfs_proto.USER,
    )
    next(subscription)

    assert len(list(pach.list_commit("pipeline-spout-commit"))) == 1
示例#3
0
def test_put_files():
    """put_files should upload a directory tree under both "/" and "/sub"."""
    pach = python_pachyderm.Client()
    repo = util.create_test_repo(pach, "put_files")

    with tempfile.TemporaryDirectory(suffix="python_pachyderm") as tmp:
        # Lay out: 0.txt..4.txt at the top level plus 0/0.txt..4/4.txt.
        for n in range(5):
            os.makedirs(os.path.join(tmp, str(n)))

        for n in range(5):
            for target in ("{}.txt".format(n),
                           os.path.join(str(n), "{}.txt".format(n))):
                with open(os.path.join(tmp, target), "w") as handle:
                    handle.write(str(n))

        # Upload under `/` and `/sub` (the latter twice, checking both
        # path normalization and re-putting files that already exist).
        commit = "{}/master".format(repo)
        for dest in ("/", "/sub", "/sub/"):
            python_pachyderm.put_files(pach, tmp, commit, dest)

    expected = {"/", "/sub"}
    for n in range(5):
        for prefix in ("", "/sub"):
            expected.add("{}/{}".format(prefix, n))
            expected.add("{}/{}.txt".format(prefix, n))
            expected.add("{}/{}/{}.txt".format(prefix, n, n))

    check_expected_files(pach, commit, expected)
示例#4
0
def test_delete_all_transactions():
    """delete_all_transactions should wipe every open transaction."""
    pach = python_pachyderm.Client()
    for _ in range(2):
        pach.start_transaction()
    assert len(pach.list_transaction()) == 2
    pach.delete_all_transactions()
    assert len(pach.list_transaction()) == 0
示例#5
0
    def __init__(self,
                 commit,
                 path_prefix="/",
                 pachy_host=None,
                 pachy_port="30650",
                 local_root='/data',
                 transform=None):
        """Connect to Pachyderm and load this dataset's meta.json.

        Args:
            commit: commit (or repo/commit spec) to read files from.
            path_prefix: path prefix within the repo, default "/".
            pachy_host: Pachyderm host; falls back to the
                PACHYDERM_HOST_URI environment variable. The original
                read the variable at class-definition time, which made
                importing the module fail whenever it was unset.
            pachy_port: Pachyderm port, default "30650".
            local_root: local directory files are downloaded into.
            transform: torchvision-style transform; defaults to a
                Resize/ToTensor/Normalize pipeline (ImageNet stats).
        """
        # Resolve defaults lazily so neither the environment lookup nor
        # the transform object is evaluated at import time (and the
        # default transform is no longer a shared mutable default).
        if pachy_host is None:
            pachy_host = os.environ['PACHYDERM_HOST_URI']
        if transform is None:
            transform = T.Compose([
                T.Resize((256, 256)),
                T.ToTensor(),
                T.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
            ])

        self.commit = commit
        self.path_prefix = path_prefix
        self.local_root = local_root

        self.client = python_pachyderm.Client(host=pachy_host, port=pachy_port)

        # Locate meta.json in the repo; each entry records path + size.
        self.meta_path_lst = [{
            'path': res.file.path,
            'size': res.size_bytes
        } for res in self.client.glob_file(commit, path_prefix + "meta.json")]

        self._download_data_from_pachyderm(self.meta_path_lst,
                                           self.path_prefix + "meta.json")

        # Parse the downloaded metadata to learn the class names.
        with open(os.path.join(self.local_root, "meta.json")) as meta_f:
            meta = json.load(meta_f)
        self.meta = meta
        self.class_names = self.meta["class_names"]
        self.num_classes = len(self.class_names)
        self.transform = transform
示例#6
0
def test_create_pipeline_from_request():
    """Creating a pipeline from a raw CreatePipelineRequest should work."""
    pach = python_pachyderm.Client()

    repo_name = util.create_test_repo(pach, "test_create_pipeline_from_request")
    pipeline_name = util.test_repo_name("test_create_pipeline_from_request")

    # Roughly the opencv demo's "edges" pipeline spec.
    request = pps_proto.CreatePipelineRequest(
        pipeline=pps_proto.Pipeline(name=pipeline_name),
        description="A pipeline that performs image edge detection by using the OpenCV library.",
        input=pps_proto.Input(
            pfs=pps_proto.PFSInput(glob="/*", repo=repo_name),
        ),
        transform=pps_proto.Transform(cmd=["echo", "hi"],
                                      image="pachyderm/opencv"),
    )
    pach.create_pipeline_from_request(request)

    assert any(item.pipeline.name == pipeline_name
               for item in list(pach.list_pipeline()))
示例#7
0
def load():
    """Build the serving context: API clients, the downloaded model, and
    NLTK data.

    Returns:
        dict with keys 'client' (Algorithmia), 'pach_client' (Pachyderm),
        'classification_model_path', and 'classification_model'.
    """
    output = {}
    output['client'] = Algorithmia.client()

    output['pach_client'] = python_pachyderm.Client(
        host=os.environ["PACH_HOST"],
        port=os.environ["PACH_PORT"],
        auth_token=os.environ["PACH_AUTH"],
        tls=True)

    # Download the model and config. The two files were previously
    # fetched with duplicated open/get_file blocks; loop instead.
    Path("/tmp/trained_model").mkdir(parents=True, exist_ok=True)
    for filename in ("config.json", "pytorch_model.bin"):
        with open("/tmp/trained_model/" + filename, "wb") as f:
            f.write(output['pach_client'].get_file(
                ("train_model", MODEL_VERSION), filename).read())

    output['classification_model_path'] = Path("/tmp/trained_model/")
    output[
        'classification_model'] = AutoModelForSequenceClassification.from_pretrained(
            output['classification_model_path'], cache_dir=None, num_labels=3)

    # Corpora needed by the text preprocessing pipeline.
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')

    return output
示例#8
0
def download_pach_repo(pachyderm_host, pachyderm_port, repo, branch, root):
    """Download every file from ``repo@branch`` into the local dir ``root``.

    Directories are recreated locally; regular files are streamed to
    disk, and any file ending in ``.tar.gz`` is additionally extracted
    into ``root``.

    Fixes over the original: file and tar handles are managed with
    ``with`` (the original leaked the output handle on a failed write
    and never closed the TarFile), the loop iterates files directly
    instead of indexing by ``range(len(...))``, the builtin name
    ``bytes`` is no longer shadowed, and the unused ``args``/``count``
    locals are gone.
    """
    client = python_pachyderm.Client(host=pachyderm_host, port=pachyderm_port)
    os.makedirs(root, exist_ok=True)

    # First pass: create directories, remember the regular files.
    file_paths = []
    for info in client.walk_file((repo, branch), "/"):
        path = info.file.path
        local_path = os.path.join(root, path[1:])
        if info.file_type == 2:  # 2 == directory in the pfs proto
            os.makedirs(local_path, exist_ok=True)
        else:
            file_paths.append((path, local_path))

    # Second pass: stream each file's content to disk.
    for path, local_path in file_paths:
        with open(local_path, 'wb') as out:
            for chunk in client.get_file((repo, branch), path):
                out.write(chunk)
        if local_path.endswith('.tar.gz'):
            with tarfile.open(local_path) as tar:
                tar.extractall(path=root)
示例#9
0
def main():
    """Run the opencv-style edges/montage demo against a local cluster."""
    # Connects to a pachyderm cluster on the default host:port
    # (`localhost:30650`). This works for certain environments (e.g. k8s
    # on docker for mac) and when port forwarding is used. Otherwise:
    # 1) inside the cluster: `python_pachyderm.Client.new_in_cluster()`
    # 2) via a pachd address:
    #    `python_pachyderm.Client.new_from_pachd_address`
    # 3) explicit host/port: `python_pachyderm.Client(host=..., port=...)`
    client = python_pachyderm.Client()

    # Input repo that will hold the raw images.
    client.create_repo("images")

    # A pipeline specifically designed for executing python code;
    # equivalent to the edges pipeline in the standard opencv example.
    python_pachyderm.create_python_pipeline(
        client,
        relpath("edges"),
        input=python_pachyderm.Input(
            pfs=python_pachyderm.PFSInput(glob="/*", repo="images")),
    )

    # The montage pipeline stitches images + edges into a single png.
    montage_stdin = [
        "montage -shadow -background SkyBlue -geometry 300x300+2+2 $(find /pfs -type f | sort) /pfs/out/montage.png"
    ]
    montage_input = python_pachyderm.Input(cross=[
        python_pachyderm.Input(
            pfs=python_pachyderm.PFSInput(glob="/", repo="images")),
        python_pachyderm.Input(
            pfs=python_pachyderm.PFSInput(glob="/", repo="edges")),
    ])
    client.create_pipeline(
        "montage",
        transform=python_pachyderm.Transform(
            cmd=["sh"],
            image="v4tech/imagemagick",
            stdin=montage_stdin,
        ),
        input=montage_input,
    )

    # Add some images, recursively inserting content from the images
    # directory. Alternatively `client.put_file_url` or
    # `client.put_file_bytes` could be used.
    with client.commit("images", "master") as commit:
        python_pachyderm.put_files(client, relpath("images"), commit, "/")

    # Wait for the commit (and its downstream commits) to finish.
    for _ in client.flush_commit([commit]):
        pass

    # Fetch the montage and copy it into a temp file.
    source_file = client.get_file("montage/master", "/montage.png")
    with tempfile.NamedTemporaryFile(suffix="montage.png",
                                     delete=False) as dest_file:
        shutil.copyfileobj(source_file, dest_file)
        print(f"montage written to {dest_file.name}")
示例#10
0
 def __init__(self, test_name):
     """Create a client plus a test pipeline and record their handles."""
     self.client = python_pachyderm.Client()
     (self.commit,
      self.input_repo_name,
      self.pipeline_repo_name) = util.create_test_pipeline(
          self.client, test_name)
def main() -> None:
    """Summarize upload/download/process times for every pipeline in a DAG."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--host',
                            required=True,
                            help='Only the hostname of a grpc URL.')
    arg_parser.add_argument('--port', required=True, help='The port number.')
    arg_parser.add_argument('--specification',
                            required=True,
                            help='A DAG end node pipeline specification path.')
    arg_parser.add_argument('--specifications',
                            required=True,
                            help='A path containing pipeline specification files.')
    args = arg_parser.parse_args()
    host = args.host
    port = int(args.port)
    specification = Path(args.specification)
    specifications = Path(args.specifications)

    print(f'host: {host}')
    print(f'port: {port}')
    print(f'specification: {specification}')
    print(f'specifications: {specifications}')

    client = python_pachyderm.Client(host=host, port=port)

    # Named spec_parser (not `parser`) to avoid shadowing the argparse
    # parser above.
    spec_parser = PipelineSpecificationParser(specification, specifications)
    dag_builder = DagManager(spec_parser).get_dag_builder()

    total_upload = 0
    total_download = 0
    total_process = 0
    for pipeline_name in dag_builder.get_pipeline_names():
        job = data_finder.get_latest_job(client, pipeline_name)
        if job is None:
            print(f'No jobs are available for {pipeline_name}')
            continue
        job_data = data_finder.get_job_run_times(job)
        upload_time = job_data.get('upload')
        download_time = job_data.get('download')
        process_time = job_data.get('process')
        datums_processed = job_data.get('datums_processed')
        print(f'pipeline: {pipeline_name} '
              f'upload time: {upload_time} sec. '
              f'download time: {download_time} sec. '
              f'process time {process_time} sec. '
              f'datums processed {datums_processed}')
        # `x or 0` maps None to 0; a genuine 0 contributes nothing either
        # way, so totals match the original explicit None checks.
        total_upload += upload_time or 0
        total_download += download_time or 0
        total_process += process_time or 0
    print(f'total upload: {total_upload} sec. '
          f'total download: {total_download} sec. '
          f'total_process: {total_process} sec. ')
示例#12
0
def test_delete_all_repos():
    """delete_all_repos should remove every repo in the cluster."""
    pach = python_pachyderm.Client()

    for prefix in ("extra-1", "extra-2"):
        util.create_test_repo(pach, "test_delete_all_repos", prefix=prefix)
    assert len(list(pach.list_repo())) >= 2

    pach.delete_all_repos()
    assert len(list(pach.list_repo())) == 0
示例#13
0
def test_transaction_context_mgr_nested():
    """Nested transaction contexts must each get a distinct transaction id."""
    pach = python_pachyderm.Client()

    with pach.transaction():
        outer_id = pach.transaction_id
        assert outer_id is not None

        with pach.transaction():
            inner_id = pach.transaction_id
            assert inner_id is not None
            assert inner_id != outer_id

        # Leaving the inner context restores the outer transaction.
        assert pach.transaction_id == outer_id
示例#14
0
def test_put_files_single_file():
    """put_files given a single file should upload it to each target path."""
    pach = python_pachyderm.Client()
    pach.delete_all()
    repo = util.create_test_repo(pach, "put_files_single_file")
    commit = (repo, "master")

    with tempfile.NamedTemporaryFile() as src:
        src.write(b"abcd")
        src.flush()
        for dest in ("/f1.txt", "/f/f1"):
            python_pachyderm.put_files(pach, src.name, commit, dest)

    check_expected_files(pach, commit, {"/", "/f1.txt", "/f/", "/f/f1"})
示例#15
0
def test_create_python_pipeline_bad_path():
    """create_python_pipeline must raise when given a nonexistent path."""
    pach = python_pachyderm.Client()
    repo = util.create_test_repo(pach, "create_python_pipeline_bad_path")

    # Seed the repo with a single file.
    with pach.commit(repo, "master") as commit:
        pach.put_file_bytes(commit, 'file.dat', b'DATA')

    # Pointing at a directory that does not exist should fail.
    pipeline_input = python_pachyderm.Input(
        pfs=python_pachyderm.PFSInput(glob="/", repo=repo))
    with pytest.raises(Exception):
        python_pachyderm.create_python_pipeline(
            pach, "./foobar2000", input=pipeline_input)
示例#16
0
def test_pachyderm_version():
    """Return the cluster version as a tuple, caching it module-wide.

    Prefers the PACHYDERM_VERSION environment variable; otherwise asks
    the cluster itself.
    """
    global _test_pachyderm_version

    if _test_pachyderm_version is None:
        env_value = os.environ.get("PACHYDERM_VERSION")
        if env_value is not None:
            _test_pachyderm_version = tuple(
                int(part) for part in env_value.split("."))
        else:
            version = python_pachyderm.Client().get_remote_version()
            _test_pachyderm_version = (version.major, version.minor,
                                       version.micro)

    return _test_pachyderm_version
示例#17
0
def test_transaction_context_mgr_exception():
    """An exception inside a transaction context must roll everything back."""
    pach = python_pachyderm.Client()
    repos_before = len(list(pach.list_repo()))

    with pytest.raises(Exception):
        with pach.transaction():
            for _ in range(2):
                util.create_test_repo(pach,
                                      "test_transaction_context_mgr_exception")
            raise Exception("oops!")

    # The transaction is gone and no repos were actually created.
    assert len(pach.list_transaction()) == 0
    assert len(list(pach.list_repo())) == repos_before
示例#18
0
 def download_data(self) -> str:
     """Download the configured Pachyderm repo/branch into this trial's
     download directory and return the local data path.
     """
     data_config = self.context.get_data_config()
     data_dir = os.path.join(self.download_directory, 'data')
     pachyderm_host = data_config['pachyderm']['host']
     pachyderm_port = data_config['pachyderm']['port']
     # NOTE(review): the original also constructed an unused
     # python_pachyderm.Client here; download_pach_repo builds its own
     # client, so the redundant one has been removed.
     download_pach_repo(
         pachyderm_host,
         pachyderm_port,
         data_config["pachyderm"]["repo"],
         data_config["pachyderm"]["branch"],
         data_dir,
     )
     return data_dir
示例#19
0
def test_create_spout():
    """Creating a spout pipeline should register exactly one pipeline."""
    pach = python_pachyderm.Client()
    pach.delete_all()

    pach.create_pipeline(
        pipeline_name="pipeline-create-spout",
        transform=pps_proto.Transform(cmd=["sh"], image="alpine"),
        spout=pps_proto.Spout(),
    )

    assert len(list(pach.list_pipeline())) == 1
示例#20
0
def client():
    """Yield a root-authenticated enterprise client, then tear it down."""
    pach = python_pachyderm.Client()
    pach.activate_license(os.environ["PACH_PYTHON_ENTERPRISE_CODE"])
    pach.add_cluster("localhost", "localhost:1650", secret="secret")
    pach.activate_enterprise("localhost:1650", "localhost", "secret")

    pach.auth_token = "iamroot"
    pach.activate_auth(pach.auth_token)
    pach.set_identity_server_config(config=identity_proto.IdentityServerConfig(
        issuer="http://localhost:1658"))
    yield pach
    # Re-set the token: tests may have overridden it, and the teardown
    # calls below need root privileges.
    pach.auth_token = "iamroot"
    pach.delete_all()
    pach.deactivate_enterprise()
示例#21
0
def test_create_pipeline():
    """A basic transform pipeline should be created and show up in listing."""
    pach = python_pachyderm.Client()
    pach.delete_all()

    input_repo = util.create_test_repo(pach, "input_repo_test_create_pipeline")

    copy_cmd = "cp /pfs/{}/*.dat /pfs/out/".format(input_repo)
    pach.create_pipeline(
        "pipeline_test_create_pipeline",
        transform=pps_proto.Transform(
            cmd=["sh"],
            image="alpine",
            stdin=[copy_cmd],
        ),
        input=pps_proto.Input(
            pfs=pps_proto.PFSInput(glob="/*", repo=input_repo)),
    )
    assert len(list(pach.list_pipeline())) == 1
def main() -> None:
    """Print run-time stats for the most recent job of a single pipeline."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--host',
                            required=True,
                            help='Only the hostname part of a grpc URL.')
    arg_parser.add_argument('--port', required=True, help='The port number.')
    arg_parser.add_argument('--pipeline', required=True, help='A pipeline name.')
    args = arg_parser.parse_args()

    pipeline_name = args.pipeline
    client = python_pachyderm.Client(host=args.host, port=int(args.port))
    job = get_latest_job(client, pipeline_name)
    if job is None:
        print(f'No jobs are available for {pipeline_name}.')
    else:
        get_job_run_times(job)
示例#23
0
def test_list_commit():
    """list_commit with no repo filter should see commits from all repos."""
    python_pachyderm.Client().delete_all_repos()

    client, first_repo = sandbox("list_commit1")

    # Two commits in the first repo...
    for _ in range(2):
        with client.commit(first_repo, "master"):
            pass

    # ...and one commit in a second repo.
    second_repo = util.create_test_repo(client, "list_commit2")
    with client.commit(second_repo, "master"):
        pass

    assert len(list(client.list_commit())) == 3
示例#24
0
def test_delete_transaction():
    """Deleting a transaction discards it without undoing prior operations."""
    pach = python_pachyderm.Client()
    repos_before = len(list(pach.list_repo()))

    transaction = pach.start_transaction()
    for _ in range(2):
        util.create_test_repo(pach, "test_delete_transaction")
    pach.delete_transaction(transaction)

    assert len(pach.list_transaction()) == 0
    # The repos survive: their creation wasn't tied to the transaction.
    assert len(list(pach.list_repo())) == repos_before + 2

    # Deleting the same transaction twice is an error.
    with pytest.raises(python_pachyderm.RpcError):
        pach.delete_transaction(transaction)
示例#25
0
def test_batch_transaction():
    """batch_transaction should apply all requests and leave none pending."""
    pach = python_pachyderm.Client()
    expected_repo_count = len(list(pach.list_repo())) + 3

    def new_repo_request():
        # Each request creates a uniquely-named test repo.
        repo = pfs_pb2.Repo(name=util.test_repo_name("test_batch_transaction"))
        return transaction_pb2.TransactionRequest(
            create_repo=pfs_pb2.CreateRepoRequest(repo=repo))

    pach.batch_transaction([new_repo_request() for _ in range(3)])

    assert len(pach.list_transaction()) == 0
    assert len(list(pach.list_repo())) == expected_repo_count
示例#26
0
def test_transaction_context_mgr():
    """The transaction context manager should batch repo creations."""
    pach = python_pachyderm.Client()
    expected_repo_count = len(list(pach.list_repo())) + 2

    with pach.transaction() as transaction:
        for _ in range(2):
            util.create_test_repo(pach, "test_transaction_context_mgr")

        # Exactly one open transaction, inspectable by object or by id.
        open_transactions = pach.list_transaction()
        assert len(open_transactions) == 1
        assert open_transactions[0].transaction.id == transaction.id
        assert pach.inspect_transaction(
            transaction).transaction.id == transaction.id
        assert (pach.inspect_transaction(
            transaction.id).transaction.id == transaction.id)

    assert len(pach.list_transaction()) == 0
    assert len(list(pach.list_repo())) == expected_repo_count
示例#27
0
def main():
    """Create the spout/processor/reducer pipelines of the spout101 demo."""
    client = python_pachyderm.Client()

    demo_image = "pachyderm/example-spout101:2.0.0-beta.5"

    # Spout: emulates data arriving from an external source.
    client.create_pipeline(
        pipeline_name="spout",
        transform=pps_proto.Transform(
            cmd=["python3", "consumer/main.py"],
            image=demo_image,
        ),
        spout=pps_proto.Spout(),
        description="A spout pipeline that emulates the reception of data from an external source",
    )

    # Processor: consumes the spout's output.
    client.create_pipeline(
        pipeline_name="processor",
        transform=pps_proto.Transform(
            cmd=["python3", "processor/main.py"],
            image=demo_image,
        ),
        input=pps_proto.Input(
            pfs=pps_proto.PFSInput(repo="spout", branch="master", glob="/*")),
        description="A pipeline that sorts 1KB vs 2KB files",
    )

    # Reducer: concatenates each processor subdirectory into one file.
    reducer_script = [
        "set -x",
        "FILES=/pfs/processor/*/*",
        "for f in $FILES",
        "do",
        "directory=`dirname $f`",
        "out=`basename $directory`",
        "cat $f >> /pfs/out/${out}.txt",
        "done",
    ]
    client.create_pipeline(
        pipeline_name="reducer",
        transform=pps_proto.Transform(cmd=["bash"], stdin=reducer_script),
        input=pps_proto.Input(pfs=pps_proto.PFSInput(
            repo="processor", branch="master", glob="/*")),
        description="A pipeline that reduces 1K/ and 2K/ directories",
    )
示例#28
0
def test_enterprise():
    """Exercise the enterprise licensing lifecycle end to end."""
    pach = python_pachyderm.Client()
    pach.delete_all_license()

    code = os.environ["PACH_PYTHON_ENTERPRISE_CODE"]
    pach.activate_license(code)
    pach.add_cluster("localhost", "localhost:1650", secret="secret")
    pach.update_cluster("localhost", "localhost:1650", "localhost:16650")
    pach.activate_enterprise("localhost:1650", "localhost", "secret")

    assert len(pach.list_clusters()) == len(pach.list_user_clusters())
    assert pach.get_enterprise_state().state == enterprise_proto.State.ACTIVE
    assert pach.get_activation_code().activation_code == code

    pach.delete_cluster("localhost")
    pach.deactivate_enterprise()
    pach.delete_all_license()
示例#29
0
def sandbox():
    """Generator fixture: activate enterprise + auth, yield a
    root-authenticated client, and always clean up afterwards.
    """
    client = python_pachyderm.Client()
    client.activate_enterprise(os.environ["PACH_PYTHON_ENTERPRISE_CODE"])
    root_auth_token = None

    try:
        root_auth_token = client.activate_auth("robot:root")
        client.auth_token = root_auth_token
        try:
            yield client
        finally:
            try:
                client.deactivate_auth()
                client.auth_token = None
            except Exception:
                # Was a bare `except:`, which would also intercept
                # KeyboardInterrupt/SystemExit; Exception is the
                # narrowest catch that still covers RPC failures here.
                print(
                    "an exception occurred trying to deactivate auth, please manually disable auth with the root auth token: {}"
                    .format(root_auth_token))
                raise
    finally:
        # Enterprise is deactivated even if auth teardown failed.
        client.deactivate_enterprise()
示例#30
0
def main():
    """Create a spout producer pipeline and a python consumer pipeline."""
    client = python_pachyderm.Client()

    # Producer: a spout that writes data, tracking state via a marker file.
    client.create_pipeline(
        pipeline_name="producer",
        transform=python_pachyderm.Transform(
            cmd=["python3", "/app/main.py"],
            image="ysimonson/pachyderm_spout_producer",
        ),
        spout=python_pachyderm.Spout(
            overwrite=False,
            marker="marker",
        ),
    )

    # Consumer: a python pipeline reading everything the producer emits.
    consumer_input = python_pachyderm.Input(
        pfs=python_pachyderm.PFSInput(glob="/", repo="producer"))
    python_pachyderm.create_python_pipeline(
        client,
        relpath("consumer"),
        input=consumer_input,
    )