def lookup_repository(name: str, include_local: bool = False) -> "Repository":
    """
    Find an engine that hosts the given repository and return a Repository bound to it.

    Resolution order: a per-repository lookup override, then (optionally) the local
    engine, then every engine on the lookup path in order.

    :param name: Repository name
    :param include_local: If True, also queries the local engine
    :return: Local or remote Repository object
    :raises RepositoryNotFoundError: if no queried engine has the repository
    """
    from splitgraph.core.repository import Repository

    local = Repository.from_schema(name)

    # An explicit override wins and is returned without checking that the repository exists.
    if name in _LOOKUP_PATH_OVERRIDE:
        override_engine = get_engine(_LOOKUP_PATH_OVERRIDE[name])
        return Repository(local.namespace, local.repository, override_engine)

    # Currently just check if the schema with that name exists on the remote.
    if include_local and repository_exists(local):
        return local

    for engine_name in _LOOKUP_PATH:
        remote = Repository(local.namespace, local.repository, get_engine(engine_name))
        if not repository_exists(remote):
            # Not hosted here: release the connection before trying the next engine.
            remote.engine.close()
            continue
        return remote

    raise RepositoryNotFoundError("Unknown repository %s!" % name)
def dependents_c(image_spec, source_on, dependents_on):
    """
    List images that were created from an image.

    This is the inverse of the sgr provenance command. It will list all images that were
    created using a Splitfile that imported data from this image.

    By default, this will look at images on the local engine. The engine can be overridden
    with --source-on and --dependents-on. For example:

        sgr dependents --source-on data.splitgraph.com --dependents-on LOCAL noaa/climate:latest

    will show all images on the local engine that derived data from `noaa/climate:latest`
    on the Splitgraph registry.
    """
    from splitgraph.engine import get_engine
    from splitgraph.core.repository import Repository

    def _pick_engine(engine_name):
        # A falsy name selects the current default engine.
        return get_engine(engine_name) if engine_name else get_engine()

    source_engine = _pick_engine(source_on)
    repo_template, image_ref = image_spec
    repository = Repository.from_template(repo_template, engine=source_engine)
    image = repository.images[image_ref]

    target_engine = _pick_engine(dependents_on)
    dependents = image.provenance(reverse=True, engine=target_engine)

    header = "%s:%s is depended on by:" % (str(repository), image.image_hash)
    click.echo(header)
    lines = ["%s:%s" % rs for rs in dependents]
    click.echo("\n".join(lines))
def healthcheck_mounting():
    # Pre-flight check for the heavier tests: mounts the three origin databases used by the
    # FDW tests and makes sure each one answers a query. Tests that need one of these
    # databases are marked with @pytest.mark.mounting and can be
    # excluded with `poetry run pytest -m "not mounting"`.
    mountpoints = [PG_MNT, MG_MNT, MYSQL_MNT]
    for stale in mountpoints:
        stale.delete()

    _mount_postgres(PG_MNT)
    _mount_mongo(MG_MNT)
    _mount_mysql(MYSQL_MNT)

    # One cheap query per mounted database, in mount order.
    probes = (
        'SELECT COUNT(*) FROM "test/pg_mount".fruits',
        'SELECT COUNT(*) FROM "test_mg_mount".stuff',
        'SELECT COUNT(*) FROM "test/mysql_mount".mushrooms',
    )
    try:
        for probe in probes:
            row_count = get_engine().run_sql(probe, return_shape=ResultShape.ONE_ONE)
            assert row_count is not None
    finally:
        # Always unmount, whether or not the probes succeeded.
        for mounted in [PG_MNT, MG_MNT, MYSQL_MNT]:
            mounted.delete()
def _execute_custom(node: Node, output: Repository) -> ProvenanceLine:
    """
    Run a custom Splitfile command against the output repository.

    The command name is resolved to a fully-qualified class via the `commands`
    section of the config. If the command can compute its hash up front and the
    resulting image already exists, that image is checked out instead of
    executing the command.

    :param node: Parsed Splitfile node for the custom command.
    :param output: Repository the command operates on (must have a checked-out head).
    :return: Provenance line `{"type": "CUSTOM"}`.
    :raises SplitfileError: if the command isn't configured or its class can't be loaded.
    """
    assert output.head is not None
    command_name, args = parse_custom_command(node)

    # Locate the command in the config file and instantiate it.
    class_path: str = cast(str, get_all_in_section(CONFIG, "commands").get(command_name))
    if not class_path:
        raise SplitfileError(
            "Custom command {0} not found in the config! Make sure you add an entry to your"
            " config like so:\n [commands] \n{0}=path.to.command.Class".format(command_name)
        )
    assert isinstance(class_path, str)

    # Split "package.module.Class" at the last dot into module path and class name.
    split_at = class_path.rindex(".")
    module_path, class_name = class_path[:split_at], class_path[split_at + 1:]
    try:
        cmd_class = getattr(import_module(module_path), class_name)
    except (AttributeError, ImportError) as e:
        raise SplitfileError("Error loading custom command {0}".format(command_name)) from e

    get_engine().run_sql("SET search_path TO %s", (output.to_schema(),))
    custom_command = cmd_class()

    # Pre-flight check: get the new command hash and see if we can short-circuit and just
    # check the image out.
    command_hash = custom_command.calc_hash(repository=output, args=args)
    output_head = output.head.image_hash

    if command_hash is not None:
        image_hash = _combine_hashes([output_head, command_hash])
        try:
            output.images.by_hash(image_hash).checkout()
        except ImageNotFoundError:
            pass
        else:
            logging.info(" ---> Using cache")
            return {"type": "CUSTOM"}

    logging.info(" Executing custom command...")
    exec_hash = custom_command.execute(repository=output, args=args)
    # Prefer the precomputed hash, then the one returned by execute(), else a random one.
    command_hash = command_hash or exec_hash or "{:064x}".format(getrandbits(256))

    image_hash = _combine_hashes([output_head, command_hash])
    logging.info(" ---> %s" % image_hash[:12])

    # Check just in case if the new hash produced by the command already exists.
    try:
        output.images.by_hash(image_hash).checkout()
    except ImageNotFoundError:
        # Full command as a commit comment
        output.commit(image_hash, comment=node.text)
    return {"type": "CUSTOM"}
def version_engine_c(name):
    """Get version of Splitgraph engine."""
    from splitgraph.engine import get_engine

    # The default engine is looked up without a name; any other engine by its name.
    engine = get_engine() if name == DEFAULT_ENGINE else get_engine(name)

    version = engine.splitgraph_version
    if not version:
        return
    click.echo("Splitgraph Engine %s" % version)
def test_mount_force_schema(local_engine_empty):
    # Mount only `fruits`, overriding its schema with a single varchar column.
    forced_tables = {"fruits": {"schema": {"fruit_id": "character varying"}}}
    _mount_postgres(PG_MNT, tables=forced_tables)

    schema_name = PG_MNT.to_schema()
    assert get_engine().table_exists(schema_name, "fruits")

    expected_columns = [TableColumn(1, "fruit_id", "character varying", False, None)]
    actual_columns = get_engine().get_full_table_schema(schema_name, "fruits")
    assert actual_columns == expected_columns
def test_commandline_lq_checkout(pg_repo_local):
    runner = CliRunner()
    repo_name = str(pg_repo_local)

    # Uncheckout first
    uncheckout = runner.invoke(checkout_c, [repo_name, "-u", "-f"])
    assert uncheckout.exit_code == 0
    assert pg_repo_local.head is None
    assert not get_engine().schema_exists(repo_name)

    # Check the latest image out again, this time with the -l flag.
    checkout = runner.invoke(checkout_c, [repo_name + ":latest", "-l"])
    assert checkout.exit_code == 0
    assert pg_repo_local.head is not None
    assert get_engine().schema_exists(repo_name)

    table_type = get_engine().get_table_type(repo_name, "fruits")
    assert table_type in ("FOREIGN TABLE", "FOREIGN")
def invoke(self, ctx):
    """
    Invoke the command, committing the engine on success and rolling back on failure.

    Click's own exceptions are re-raised untouched. Any other exception is logged
    (with a full traceback only at DEBUG level) and the process exits with code 2.
    The version check and engine close always run.
    """
    from splitgraph.engine import get_engine

    engine = get_engine()
    try:
        outcome = super(click.Group, self).invoke(ctx)
        engine.commit()
        return outcome
    except Exception as exc:
        engine.rollback()

        click_handled = (
            click.exceptions.ClickException,
            click.exceptions.Abort,
            click.exceptions.Exit,
        )
        if isinstance(exc, click_handled):
            raise

        # Can't seem to be able to get the click_log verbosity option
        # value so have to get it indirectly. Basically, if we're in
        # DEBUG mode, output the whole stacktrace.
        debug_mode = logger.getEffectiveLevel() == logging.DEBUG
        if debug_mode:
            message = traceback.format_exc()
        else:
            message = "%s: %s" % (get_exception_name(exc), exc)
        logger.error(message)
        ctx.exit(code=2)
    finally:
        _do_version_check()
        engine.close()
def invoke(self, ctx):
    """
    Invoke the command, committing the engine on success and rolling back on failure.

    Also installs psycopg2's `wait_select` wait callback before running. Click's own
    exceptions are re-raised untouched. Any other exception is logged (with a full
    traceback only at DEBUG level) and the process exits with code 2. The version
    check and engine close always run.
    """
    from splitgraph.engine import get_engine
    import psycopg2.extensions
    import psycopg2.extras

    # Allow users to send SIGINT to quickly terminate sgr (instead of waiting for a PG
    # statement to finish)
    psycopg2.extensions.set_wait_callback(psycopg2.extras.wait_select)

    engine = get_engine()
    try:
        outcome = super(click.Group, self).invoke(ctx)
        engine.commit()
        return outcome
    except Exception as exc:
        engine.rollback()

        click_handled = (
            click.exceptions.ClickException,
            click.exceptions.Abort,
            click.exceptions.Exit,
        )
        if isinstance(exc, click_handled):
            raise

        # Can't seem to be able to get the click_log verbosity option
        # value so have to get it indirectly. Basically, if we're in
        # DEBUG mode, output the whole stacktrace.
        debug_mode = logger.getEffectiveLevel() == logging.DEBUG
        if debug_mode:
            message = traceback.format_exc()
        else:
            message = "%s: %s" % (get_exception_name(exc), exc)
        logger.error(message)
        ctx.exit(code=2)
    finally:
        _do_version_check()
        engine.close()
def clone_c(remote_repository_or_image, local_repository, remote, download_all, overwrite_object_meta, tags):
    """
    Clone a remote Splitgraph repository/image into a local one.

    The lookup path for the repository is governed by the ``SG_REPO_LOOKUP`` and ``SG_REPO_LOOKUP_OVERRIDE``
    config parameters and can be overridden by the command line ``--remote`` option.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine
    from splitgraph.core.repository import clone

    repo_spec, image = remote_repository_or_image

    # If the user passed in a remote, we can inject that into the repository spec.
    # Otherwise, we have to turn the repository into a string and let clone() look up the
    # actual engine the repository lives on.
    source = (
        Repository.from_template(repo_spec, engine=get_engine(remote))
        if remote
        else repo_spec.to_schema()
    )

    clone(
        source,
        local_repository=local_repository,
        download_all=download_all,
        single_image=image,
        overwrite_objects=overwrite_object_meta,
        overwrite_tags=tags,
    )
def status_c(repository):
    """
    Show the status of the Splitgraph engine.

    If a repository is passed, show in-depth information about a repository.

    If not, show information about all repositories local to the engine. This will show a list
    of all repositories, number of local images and tags, total repository size (theoretical
    maximum size and current on-disk footprint of cached objects) and the current checked
    out image (with LQ if the image is checked out using read-only layered querying).
    """
    from splitgraph.core.engine import get_current_repositories
    from splitgraph.engine import get_engine

    # No repository given: summarise everything on the engine.
    if repository is None:
        engine = get_engine()
        repositories = get_current_repositories(engine)
        _emit_repository_data(repositories, engine)
        return

    head = repository.head
    if not head:
        click.echo("%s: nothing checked out." % str(repository))
        return

    parent, children = head.get_parent_children()
    click.echo("%s: on image %s." % (str(repository), head.image_hash))
    if parent is not None:
        click.echo("Parent: %s" % parent)

    # Singular/plural wording depends on how many children there are; none prints nothing.
    child_count = len(children)
    if child_count > 1:
        click.echo("Children: " + "\n".join(children))
    elif child_count == 1:
        click.echo("Child: %s" % children[0])
def test_init_new_db():
    init_command = "SG_LOGLEVEL=INFO SG_ENGINE_DB_NAME=testdb sgr init"
    try:
        # Start from a clean slate in case a previous run left the database behind.
        get_engine().delete_database("testdb")

        # CliRunner doesn't run in a brand new process and by that point PG_DB has propagated
        # through a few modules that are difficult to patch out, so let's just shell out.
        raw_output = subprocess.check_output(
            init_command,
            shell=True,
            stderr=subprocess.STDOUT,
        )
        text = raw_output.decode("utf-8")

        for expected in ("Creating database testdb", "Installing the audit trigger"):
            assert expected in text
    finally:
        get_engine().delete_database("testdb")
def test_local_engine():
    # A local Splitgraph engine fixture.
    logging.info("Initializing the test local Splitgraph engine...")
    engine = get_engine()
    engine.initialize()

    # Copy the config file over into the test engine container, since some of the layered querying
    # tests/object downloading tests get the engine to connect to the registry.
    # It's still an open question regarding how we should do it properly (config file changes do have
    # to be reflected in the engine as well). However, doing this instead of the bind mount lets
    # us switch the config used in test by switching the envvar.
    config_path = CONFIG["SG_CONFIG_FILE"]
    docker_client = docker.from_env()

    try:
        engine_container = docker_client.containers.get(SPLITGRAPH_ENGINE_CONTAINER)
        logging.info(
            "Copying .sgconfig (%s) to container %s", config_path, engine_container.short_id
        )
        copy_to_container(engine_container, config_path, "/.sgconfig")
    except docker.errors.NotFound:
        # A missing container is logged, not raised: the fixture still returns the engine.
        logging.exception(
            "Could not find the engine test container %s, is it running?",
            SPLITGRAPH_ENGINE_CONTAINER,
        )

    engine.commit()
    logging.info("Test local Splitgraph engine initialized.")
    return engine
def mount(
    mountpoint: str,
    mount_handler: str,
    handler_kwargs: Dict[str, Union[str, int, None, List[str], Dict[str, Union[str, Dict[str, str]]]]],
) -> None:
    """
    Mounts a foreign database via Postgres FDW (without creating new Splitgraph objects)

    Any existing schema named after the mountpoint and any existing
    `<mountpoint>_server` foreign server are dropped first, then the handler is
    run and the engine is committed.

    :param mountpoint: Mountpoint to import the new tables into.
    :param mount_handler: The type of the mounted database. Must be one of `postgres_fdw` or `mongo_fdw`.
    :param handler_kwargs: Dictionary of options to pass to the mount handler.
    """
    from splitgraph.engine import get_engine
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    handler = get_mount_handler(mount_handler)

    # Clear out leftovers from a previous mount: first the schema, then the foreign server.
    cleanup_statements = (
        ("DROP SCHEMA IF EXISTS {} CASCADE", mountpoint),
        ("DROP SERVER IF EXISTS {} CASCADE", mountpoint + "_server"),
    )
    for statement, object_name in cleanup_statements:
        engine.run_sql(SQL(statement).format(Identifier(object_name)))

    handler(mountpoint, **handler_kwargs)
    engine.commit()
def wrapped(*args, **kwargs):
    """
    Call the wrapped function inside an engine transaction.

    Commits the current engine if the call succeeds, rolls back and re-raises if it
    fails, and always closes the engine afterwards. The wrapped function's return
    value is discarded.
    """
    from splitgraph.engine import get_engine, set_engine

    current_engine = get_engine()
    try:
        f(*args, **kwargs)
    except Exception:
        current_engine.rollback()
        raise
    else:
        current_engine.commit()
    finally:
        current_engine.close()
        # In the context of a test run, we need to switch the global engine
        # back to LOCAL (since the engine-switching decorator doesn't
        # get control, so we can't do it there).
        set_engine(get_engine("LOCAL"))
def mount_mysql(
    mountpoint: str,
    server: str,
    port: int,
    username: str,
    password: str,
    remote_schema: str,
    tables: Optional[Union[List[str], Dict[str, Dict[str, str]]]] = None,
) -> None:
    """
    Mount a MySQL database.

    Mounts a schema on a remote MySQL database as a set of foreign tables locally.

    \b

    :param mountpoint: Schema to mount the remote into.
    :param server: Database hostname.
    :param port: Database port
    :param username: A read-only user that the database will be accessed as.
    :param password: Password for the read-only user.
    :param remote_schema: Remote schema name.
    :param tables: Tables to mount (default all). If a list, then will use IMPORT FOREIGN SCHEMA.
        If a dictionary, must have the format `{"table_name": {"col_1": "type_1", ...}}`.
    """
    from splitgraph.engine import get_engine
    from psycopg2.sql import Identifier, SQL

    # No table selection is treated as an empty list.
    tables = [] if tables is None else tables

    engine = get_engine()
    logging.info("Mounting foreign MySQL database...")
    server_id = mountpoint + "_server"

    connection_options = {"host": server, "port": str(port)}
    credentials = {"username": username, "password": password}
    init_fdw(engine, server_id, "mysql_fdw", connection_options, credentials)

    create_schema = SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(mountpoint))
    engine.run_sql(create_schema)

    # A dict of tables carries explicit column types, so those tables are created one by one.
    if not isinstance(tables, list):
        _create_foreign_tables(
            engine, server_id, mountpoint, tables, server_options={"dbname": remote_schema}
        )
        return

    _import_foreign_schema(engine, mountpoint, remote_schema, server_id, tables)
def mount_mongo(mountpoint: str, server: str, port: int, username: str, password: str, **table_spec) -> None:
    """
    Mount a Mongo database.

    Mounts one or more collections on a remote Mongo database as a set of foreign tables locally.

    \b

    :param mountpoint: Schema to mount the remote into.
    :param server: Database hostname.
    :param port: Port the Mongo server is running on.
    :param username: A read-only user that the database will be accessed as.
    :param password: Password for the read-only user.
    :param table_spec: A dictionary of form `{"table_name": {"db": <dbname>, "coll": <collection>,
        "schema": {"col1": "type1"...}}}`.
    """
    from splitgraph.engine import get_engine
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    server_id = mountpoint + "_server"
    init_fdw(
        engine,
        server_id,
        "mongo_fdw",
        {"address": server, "port": str(port)},
        {"username": username, "password": password},
    )

    engine.run_sql(SQL("""CREATE SCHEMA IF NOT EXISTS {}""").format(Identifier(mountpoint)))

    # Parse the table spec
    # {table_name: {db: remote_db_name, coll: remote_collection_name, schema: {col1: type1, col2: type2...}}}
    for table_name, table_options in table_spec.items():
        logging.info("Mounting table %s", table_name)

        # Work on a copy: the `_id` column is added to every table, and writing it into
        # the caller's own schema dict would leak the change back to the caller.
        table_schema = dict(table_options.get("schema", {}))
        table_schema["_id"] = "NAME"

        _create_foreign_table(
            engine,
            local_schema=mountpoint,
            table_name=table_name,
            schema_spec=table_schema,
            server_id=server_id,
            server_options={"database": table_options["db"], "collection": table_options["coll"]},
        )
def test_mount_mysql(local_engine_empty):
    try:
        _mount_mysql(MYSQL_MNT)

        unfriendly = get_engine().run_sql(
            """SELECT mushroom_id, name, discovery, friendly FROM "test/mysql_mount".mushrooms WHERE friendly = 0"""
        )
        # Gotchas: bool coerced to int
        expected_row = (2, "deathcap", dt(2018, 3, 17, 8, 6, 26), 0)
        assert expected_row in unfriendly
    finally:
        MYSQL_MNT.delete()
def _set_engine(ctx, param, value): if not value: return try: from splitgraph.engine import get_engine, set_engine engine = get_engine(value) set_engine(engine) except KeyError: raise click.BadParameter("Unknown remote %s!" % value)
def pg_repo_remote_registry(local_engine_empty, remote_engine_registry, clean_minio):
    # Build a throwaway local repository, push it to the registry engine with the S3
    # handler, then delete the local copy and its objects.
    staging_template = Repository("test", "pg_mount_staging")
    staging = make_pg_repo(get_engine(), staging_template)

    push_target = Repository(REMOTE_NAMESPACE, "pg_mount", engine=remote_engine_registry)
    pushed = staging.push(push_target, handler="S3", handler_options={})

    staging.delete()
    staging.objects.cleanup()
    yield pushed
def upgrade_engine_c(ctx, image, no_pull, name):
    """Upgrade a Splitgraph engine.

    This consists of shutting down the current Splitgraph engine, deleting its Docker container
    (keeping the actual data and metadata volumes intact), creating a container based on a newer
    image and finally reinitializing the engine to perform needed migrations.
    """
    from splitgraph.engine import get_engine

    # Get reference to engine to extract its connection params
    engine = get_engine() if name == DEFAULT_ENGINE else get_engine(name)

    # Read the credentials and port now so the new container can be created with them.
    params = engine.conn_params
    username = params["SG_ENGINE_USER"]
    password = params["SG_ENGINE_PWD"]
    port = params["SG_ENGINE_PORT"]

    # Stop the engine
    ctx.invoke(stop_engine_c, name=name)

    # Delete the container
    ctx.invoke(delete_engine_c, name=name, yes=True)

    # Create and start new engine
    new_engine_options = dict(
        image=image,
        port=port,
        username=username,
        password=password,
        no_sgconfig=True,
        no_pull=no_pull,
        name=name,
    )
    ctx.invoke(add_engine_c, **new_engine_options)

    version = engine.splitgraph_version
    if not version:
        return
    click.echo("Upgraded engine %s to %s" % (name, version))
def cleanup_c():
    """
    Prune unneeded objects from the engine.

    This deletes all objects from the cache that aren't required by any local repository.
    """
    from splitgraph.core.object_manager import ObjectManager
    from splitgraph.engine import get_engine
    from ..core.output import pluralise

    object_manager = ObjectManager(get_engine())
    removed = object_manager.cleanup()

    # Report how many objects were deleted.
    summary = pluralise("object", len(removed))
    click.echo("Deleted %s." % summary)
def unprivileged_remote_engine(remote_engine_registry):
    # Commit and close the registry engine before handing out the unprivileged one.
    remote_engine_registry.commit()
    remote_engine_registry.close()

    # Assumption: unprivileged_remote_engine is the same server as remote_engine_registry but with an
    # unprivileged user.
    unprivileged = get_engine("unprivileged_remote_engine")
    unprivileged.close()
    try:
        yield unprivileged
    finally:
        # Roll back whatever the test left uncommitted, then close the connection.
        unprivileged.rollback()
        unprivileged.close()
def test_remote_engine():
    # A remote (registry-like) Splitgraph engine fixture.
    engine = get_engine(REMOTE_ENGINE)

    # Initialization is skipped when SG_TEST_SKIP_REMOTE_INIT is set to a non-empty value.
    if not os.getenv("SG_TEST_SKIP_REMOTE_INIT"):
        logging.info("Initializing the test remote Splitgraph engine...")
        engine.initialize()
        engine.commit()
        logging.info("Test remote Splitgraph engine initialized.")
    else:
        logging.info("Skipping initializing the test remote Splitgraph engine...")

    return engine
def _make_push_target(repository, remote):
    """
    Build the Repository on `remote` that `repository` should be pushed to.

    The target namespace comes from the remote's `SG_NAMESPACE` config entry. If the
    lookup raises KeyError, `namespace=None` is passed to `Repository.from_template`.

    :param repository: Local Repository used as the template.
    :param remote: Name of the remote engine.
    :return: Repository bound to the remote engine.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine

    try:
        target_namespace = get_from_subsection(CONFIG, "remotes", remote, "SG_NAMESPACE")
    except KeyError:
        target_namespace = None

    return Repository.from_template(
        repository, namespace=target_namespace, engine=get_engine(remote)
    )
def sql_c(sql, schema, image, show_all, json, no_transaction):
    """
    Run an SQL statement against the Splitgraph engine.

    There are no restrictions on the contents of the statement: this is the same as running it
    from any other PostgreSQL client.

    If ``--schema`` is specified, the statement is run with the ``search_path`` set to that schema.
    This means that these statements are equivalent:

    \b
    ```
    sgr sql "SELECT * FROM \"noaa/climate\".table"
    sgr sql -s noaa/climate "SELECT * FROM table"
    ```

    If `--image` is specified, this will run the statement against that image using layered querying.
    Only read-only statements are supported. For example:

    \b
    ```
    sgr sql -i noaa/climate:latest "SELECT * FROM table"
    ```
    """
    from splitgraph.engine import get_engine

    if schema and image:
        raise click.UsageError("Only one of --schema and --image can be specified!")

    engine = get_engine()
    if no_transaction:
        engine.autocommit = True

    if image:
        # Query a specific image through a temporary layered-querying schema.
        _repo, target_image = image
        with target_image.query_schema() as lq_schema:
            results = engine.run_sql_in(lq_schema, sql)
    else:
        if schema:
            engine.run_sql("SET search_path TO %s", (schema,))
        results = engine.run_sql(sql)

    if results is None:
        return

    # Show everything when asked to or when the result is short; otherwise only the first 10 rows.
    truncate = len(results) > 10 and not show_all
    if not truncate:
        click.echo(_to_str(results, json))
        return

    click.echo(_to_str(results[:10], json))
    if not json:
        click.echo("...")
def test_mount_mysql(local_engine_empty):
    try:
        # Mount MySQL with a set schema instead of letting the FDW detect it
        _mount_mysql(MYSQL_MNT)

        rows = get_engine().run_sql(
            """SELECT mushroom_id, name, discovery, friendly, binary_data, varbinary_data FROM "test/mysql_mount".mushrooms"""
        )
        assert len(rows) == 2

        discovered_at = dt(2018, 3, 17, 8, 6, 26)
        assert any(row[1] == "deathcap" and row[2] == discovered_at for row in rows)

        # Check binary -> bytea conversion works (the data is binary-encoded 127.0.0.1 IP)
        rows_by_id = sorted(rows, key=lambda row: row[0])
        first_row = rows_by_id[0]
        assert first_row[5].hex() == "7f000001"
    finally:
        MYSQL_MNT.delete()
def _get_ddn_conn_params(remote: str) -> Dict[str, Optional[str]]:
    """
    Build connection parameters for the DDN database on the given remote.

    Takes a copy of the remote engine's connection parameters and overrides the
    database name with `_DDN_DBNAME`; the engine's own parameters are not modified.

    :param remote: Name of the remote engine in the config.
    :return: Copy of the connection parameters pointing at the DDN database.
    :raises click.UsageError: if looking up the remote raises KeyError.
    """
    from splitgraph.engine import get_engine

    try:
        engine = get_engine(remote)
    except KeyError as e:
        # Interpolate the remote name: previously the "%s" placeholder was never
        # formatted, so the user saw a literal "%s" in the error message.
        raise click.UsageError(
            "Remote %s or API key/secret not found in the config. "
            "Try registering with sgr cloud register or logging in "
            "with sgr cloud login / sgr cloud login-api." % (remote,)
        ) from e

    ddn_params = engine.conn_params.copy()
    ddn_params["SG_ENGINE_DB_NAME"] = _DDN_DBNAME
    return ddn_params
def _eval(command, args):
    """
    Execute `command` as Python code with a prepared set of local names.

    The executed code sees this function's own locals (`command`, `args`,
    `Repository`, `get_engine`, `ObjectManager`, `engine`, `object_manager`)
    plus one extra name per `(key, value)` pair in `args`. Pairs from `args`
    override same-named locals.

    :param command: Python source passed to `exec`.
    :param args: Iterable of `(name, value)` pairs to expose to the command.
    """
    # The imports below look unused but are deliberate: they are captured by
    # `locals()` further down so the executed command can reference them.
    # appease PyCharm
    # noinspection PyUnresolvedReferences
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine
    from splitgraph.core.object_manager import ObjectManager

    engine = get_engine()
    object_manager = ObjectManager(object_engine=engine, metadata_engine=engine)

    # Snapshot the local names defined so far; renaming any local above changes
    # what the executed command can see.
    command_locals = locals().copy()
    command_locals.update({k: v for k, v in args})

    # NOTE(review): exec runs arbitrary code with this module's globals; this
    # presumably only ever receives operator-supplied input - confirm with callers.
    exec(command, globals(), command_locals)
def _determine_push_target(repository, remote_repository, remote):
    """
    Create the remote Repository object we're pushing to based on all the
    parameters we've been passed

    The reason for this behaviour is to streamline out-of-the-box Splitgraph setups where
    data.splitgraph.com is the only registered engine. In that case:

      * sgr push repo: will push to myself/repo on data.splitgraph.com with S3 uploading
        (user's namespace).
      * sgr push noaa/climate: will push to myself/climate
      * sgr push noaa/climate noaa/climate: will explicitly push to noaa/climate (assuming
        the user can write to that repository).

    If the user registers another registry at splitgraph.mycompany.com, then they will be
    able to do:

      * sgr push noaa/climate -r splitgraph.mycompany.com: will push to noaa/climate

    :param repository: Local Repository, required.
    :param remote_repository: remote Repository (without the remote engine), optional.
    :param remote: Name of the remote engine/registry, optional.
    :return: Repository object to push to.
    """
    from splitgraph.core.repository import Repository
    from splitgraph.engine import get_engine

    if remote:
        if remote_repository:
            # Both given: use the requested repository on the requested remote.
            return Repository.from_template(remote_repository, engine=get_engine(remote))
        # Only the remote given: derive the target from the local repository.
        return _make_push_target(repository, remote)

    if remote_repository:
        # Only the target repository given: it lives on the default remote.
        return Repository.from_template(
            remote_repository, engine=get_engine(_get_default_remote())
        )

    # Nothing given: prefer the recorded upstream, else derive a target on the default remote.
    return repository.upstream or _make_push_target(repository, _get_default_remote())