def docker_build(uid, gid, image, docker_args):
    """Build a Docker image for the project."""
    uid, gid = get_uid_gid(uid, gid)
    project_path = get_project_context("project_path")
    image = image or str(project_path.name)
    template_path = Path(__file__).parent / "template"
    verbose = get_project_context("verbose")
    copy_template_files(
        project_path,
        template_path,
        ["Dockerfile", ".dockerignore", ".dive-ci"],
        verbose,
    )
    combined_args = compose_docker_run_args(
        required_args=[
            ("--build-arg", "KEDRO_UID={0}".format(uid)),
            ("--build-arg", "KEDRO_GID={0}".format(gid)),
        ],
        # add the image tag only if it is not already supplied by the user
        optional_args=[("-t", image)],
        user_args=docker_args,
    )
    command = ["docker", "build"] + combined_args + [str(project_path)]
    call(command)
def _get_pipeline_catalog_from_kedro14(env):
    try:
        pipeline = get_project_context("create_pipeline")()
        get_config = get_project_context("get_config")
        conf = get_config(str(Path.cwd()), env)
        create_catalog = get_project_context("create_catalog")
        catalog = create_catalog(config=conf)
        return pipeline, catalog
    except (ImportError, KeyError):
        raise KedroCliError(ERROR_PROJECT_ROOT)
def docker_init(spark):
    """Initialize a Dockerfile for the project."""
    project_path = get_project_context("project_path")
    template_path = Path(__file__).parent / "template"
    verbose = get_project_context("verbose")

    docker_file_version = "spark" if spark else "simple"
    docker_file = f"Dockerfile.{docker_file_version}"
    copy_template_files(
        project_path,
        template_path,
        [docker_file, ".dockerignore", ".dive-ci"],
        verbose,
    )
def create():
    """Create an Airflow DAG for a project."""
    src_file = Path(__file__).parent / "dag_template.py"
    dest_file = _get_dag_filename()
    dest_file.parent.mkdir(parents=True, exist_ok=True)
    template = Template(
        src_file.read_text(encoding="utf-8"), keep_trailing_newline=True
    )

    try:
        from kedro.context import (  # noqa:F401 pylint: disable=unused-import
            load_context,
        )

        context_compatibility_mode = False
    except ImportError:  # pragma: no coverage
        context_compatibility_mode = True

    dest_file.write_text(
        template.render(
            project_name=get_project_context("project_name"),
            project_path=get_project_context("project_path"),
            context_compatibility_mode=context_compatibility_mode,
        ),
        encoding="utf-8",
    )

    secho("")
    secho("An Airflow DAG has been generated in:", fg="green")
    secho(str(dest_file))
    secho("This file should be copied to your Airflow DAG folder.", fg="yellow")
    secho(
        "The Airflow configuration can be customized by editing this file.",
        fg="green",
    )
    secho("")
    secho(
        "This file also contains the path to the config directory, this directory "
        "will need to be available to Airflow and any workers.",
        fg="yellow",
    )
    secho("")
    secho(
        "Additionally all data sets must have an entry in the data catalog.",
        fg="yellow",
    )
    secho(
        "And all local paths in both the data catalog and log config must be "
        "absolute paths.",
        fg="yellow",
    )
    secho("")
def argokedro(image, templates_folder, ytt, namespace):
    """Creates an Argo pipeline YAML."""
    pc = cli.get_project_context()
    pipeline = pc.pipeline
    project_name = pc.project_name
    parameters = pc.catalog.load("parameters")
    pretty_params = transform_parameters(parameters)
    dependencies = pipeline.node_dependencies
    deps_dict = get_deps_dict(dependencies)
    tags = get_tags(pipeline)
    tagged_deps_dict = update_deps_dict_with_tags(deps_dict, tags)
    kedro_dict = {
        "tasks": tagged_deps_dict,
        "image": image,
        "project_name": project_name,
        "parameters": pretty_params,
        "namespace": namespace,
    }
    kedro_yaml = generate_yaml(kedro_dict)
    if ytt:
        kedro_yaml = ytt_add_values_part(kedro_yaml)
        copy_template(templates_folder, ytt)
        logging.info(f"YTT template saved in {templates_folder} folder")
    save_yaml(kedro_yaml, templates_folder)
    logging.info(f"Kedro template saved in {templates_folder} folder")
    if ytt:
        click.secho(FINISHED_MESSAGE_YTT)
def import_line(name):
    """Generate an import line for something in the project_context."""
    func = get_project_context(name)
    res = "from {} import {}".format(func.__module__, func.__name__)
    if func.__name__ != name:
        res = "{} as {}".format(res, name)
    return res
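# A minimal sketch of the output, assuming the project context maps
# "create_pipeline" to a function actually named make_pipeline in a
# hypothetical module new_project.pipeline:
#
#   import_line("create_pipeline")
#   # -> "from new_project.pipeline import make_pipeline as create_pipeline"
#
# The "as <name>" alias is only appended when the function's __name__ differs
# from the requested context key.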
def _mount_info() -> Dict[str, Union[str, Tuple]]:
    project_path = get_project_context("project_path")
    res = dict(
        host_root=str(project_path),
        container_root="/home/kedro",
        mount_volumes=DOCKER_DEFAULT_VOLUMES,
    )
    return res
def nodes():
    """Serve the pipeline data."""
    pipeline = get_project_context("create_pipeline")()
    return jsonify(
        [
            {
                "name": n.name,
                "inputs": [ds.split("@")[0] for ds in n.inputs],
                "outputs": [ds.split("@")[0] for ds in n.outputs],
                "tags": list(n.tags),
            }
            for n in pipeline.nodes
        ]
    )
def create():
    """Create an Airflow DAG for a project."""
    src_file = Path(__file__).parent / "dag_template.py"
    dest_file = _get_dag_filename()
    dest_file.parent.mkdir(parents=True, exist_ok=True)
    template = Template(
        src_file.read_text(encoding="utf-8"), keep_trailing_newline=True
    )

    dest_file.write_text(
        template.render(
            project_name=get_project_context("project_name"),
            import_get_config=import_line("get_config"),
            import_create_catalog=import_line("create_catalog"),
            import_create_pipeline=import_line("create_pipeline"),
            project_path=get_project_context("project_path"),
        ),
        encoding="utf-8",
    )

    secho("")
    secho("An Airflow DAG has been generated in:", fg="green")
    secho(str(dest_file))
    secho("This file should be copied to your Airflow DAG folder.", fg="yellow")
    secho(
        "The Airflow configuration can be customized by editing this file.",
        fg="green",
    )
    secho("")
    secho(
        "This file also contains the path to the config directory, this directory "
        "will need to be available to Airflow and any workers.",
        fg="yellow",
    )
    secho("")
    secho(
        "Additionally all data sets must have an entry in the data catalog.",
        fg="yellow",
    )
    secho(
        "And all local paths in both the data catalog and log config must be "
        "absolute paths.",
        fg="yellow",
    )
    secho("")
def nodes_json():
    """Serve the pipeline data."""

    def pretty_name(name):
        name = name.replace("-", " ").replace("_", " ")
        parts = [n[0].upper() + n[1:] for n in name.split()]
        return " ".join(parts)

    pipeline = get_project_context("create_pipeline")()

    nodes = []
    edges = []
    namespace_tags = defaultdict(set)
    all_tags = set()

    for node in sorted(pipeline.nodes):
        task_id = "task/" + node.name.replace(" ", "")
        nodes.append(
            {
                "type": "task",
                "id": task_id,
                "name": getattr(node, "short_name", node.name),
                "full_name": str(node),
                "tags": sorted(node.tags),
            }
        )
        all_tags.update(node.tags)
        for data_set in node.inputs:
            namespace = data_set.split("@")[0]
            edges.append({"source": "data/" + namespace, "target": task_id})
            namespace_tags[namespace].update(node.tags)
        for data_set in node.outputs:
            namespace = data_set.split("@")[0]
            edges.append({"source": task_id, "target": "data/" + namespace})
            namespace_tags[namespace].update(node.tags)

    for namespace, tags in sorted(namespace_tags.items()):
        nodes.append(
            {
                "type": "data",
                "id": "data/" + namespace,
                "name": pretty_name(namespace),
                "full_name": namespace,
                "tags": sorted(tags),
                "is_parameters": bool("param" in namespace.lower()),
            }
        )

    tags = []
    for tag in sorted(all_tags):
        tags.append({"id": tag, "name": pretty_name(tag)})

    return jsonify({"snapshots": [{"nodes": nodes, "edges": edges, "tags": tags}]})
def get_data_from_kedro():
    """Get pipeline data from Kedro and format it appropriately."""

    def pretty_name(name):
        name = name.replace("-", " ").replace("_", " ")
        parts = [n[0].upper() + n[1:] for n in name.split()]
        return " ".join(parts)

    pipeline = get_project_context("create_pipeline")()

    nodes = []
    edges = []
    namespace_tags = defaultdict(set)
    all_tags = set()

    for node in sorted(pipeline.nodes, key=lambda n: n.name):
        task_id = _hash(str(node))
        nodes.append(
            {
                "type": "task",
                "id": task_id,
                "name": getattr(node, "short_name", node.name),
                "full_name": getattr(node, "_func_name", str(node)),
                "tags": sorted(node.tags),
            }
        )
        all_tags.update(node.tags)
        for data_set in node.inputs:
            namespace = data_set.split("@")[0]
            edges.append({"source": _hash(namespace), "target": task_id})
            namespace_tags[namespace].update(node.tags)
        for data_set in node.outputs:
            namespace = data_set.split("@")[0]
            edges.append({"source": task_id, "target": _hash(namespace)})
            namespace_tags[namespace].update(node.tags)

    for namespace, tags in sorted(namespace_tags.items()):
        is_param = bool("param" in namespace.lower())
        nodes.append(
            {
                "type": "parameters" if is_param else "data",
                "id": _hash(namespace),
                "name": pretty_name(namespace),
                "full_name": namespace,
                "tags": sorted(tags),
            }
        )

    tags = []
    for tag in sorted(all_tags):
        tags.append({"id": tag, "name": pretty_name(tag)})

    return {"nodes": nodes, "edges": edges, "tags": tags}
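# A hedged sketch of the returned structure for a pipeline with one node
# "split" (tagged "de") reading dataset "example_iris_data"; ids are hashes in
# practice, and every name below is invented for illustration:
#
# {
#     "nodes": [
#         {"type": "task", "id": "<node hash>", "name": "split", ...},
#         {"type": "data", "id": "<dataset hash>", "name": "Example Iris Data", ...},
#     ],
#     "edges": [{"source": "<dataset hash>", "target": "<node hash>"}],
#     "tags": [{"id": "de", "name": "De"}],
# }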
def docker_build(ctx, uid, gid, spark, base_image, image, docker_args):
    """Build a Docker image for the project."""
    uid, gid = get_uid_gid(uid, gid)
    project_path = get_project_context("project_path")
    image = image or str(project_path.name)
    ctx.invoke(docker_init, spark=spark)

    combined_args = compose_docker_run_args(
        required_args=[
            ("--build-arg", f"KEDRO_UID={uid}"),
            ("--build-arg", f"KEDRO_GID={gid}"),
            ("--build-arg", f"BASE_IMAGE={base_image}"),
        ],
        # add the image tag only if it is not already supplied by the user
        optional_args=[("-t", image)],
        user_args=docker_args,
    )
    command = ["docker", "build"] + combined_args + [str(project_path)]
    call(command)
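# A rough illustration (values invented, not taken from plugin docs): with
# uid=1000, gid=1000, base_image="python:3.7-buster", no extra docker_args,
# and a project directory named "my-project", the assembled command is
# approximately:
#
#   docker build --build-arg KEDRO_UID=1000 --build-arg KEDRO_GID=1000 \
#       --build-arg BASE_IMAGE=python:3.7-buster -t my-project /path/to/my-project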
def _call_viz(
    host=None,
    port=None,
    browser=None,
    load_file=None,
    save_file=None,
    pipeline_name=None,
    env=None,
):
    global data  # pylint: disable=global-statement,invalid-name

    if load_file:
        data = _load_from_file(load_file)
    else:
        if match(kedro.__version__, ">=0.15.0"):
            from kedro.context import KedroContextError

            try:
                context = get_project_context("context", env=env)
                pipeline = _get_pipeline_from_context(context, pipeline_name)
            except KedroContextError:
                raise KedroCliError(ERROR_PROJECT_ROOT)
            catalog = context.catalog
        else:  # Kedro 0.14.*
            if pipeline_name:
                raise KedroCliError(ERROR_PIPELINE_FLAG_NOT_SUPPORTED)
            pipeline, catalog = _get_pipeline_catalog_from_kedro14(env)
        data = format_pipeline_data(pipeline, catalog)

    if save_file:
        Path(save_file).write_text(json.dumps(data, indent=4, sort_keys=True))
    else:
        if browser:
            webbrowser.open_new("http://127.0.0.1:{:d}/".format(port))
        app.run(host=host, port=port)
def profile(name):
    """Kedro plugin for utilizing Pandas Profiling."""
    conf_dict = kedro_conf_path()
    catalog_df = get_catalog_details(conf_dict)
    project_path = get_project_context("project_path")
    if name is None:
        print(catalog_df)
    else:
        data_path = catalog_df.at[name, "filepath"]
        data = pd_reader(project_path / data_path)
        print(f"Profiling {name} DataSet...")
        profile = data.profile_report(
            title=f"DataSet {name} - Profile Report", pool_size=0
        )
        output_path = project_path / f"data/08_reporting/{name}.html"
        profile.to_file(output_file=output_path)
        print(f"{name.title()} profile printed to {output_path}")
def test_get_context_without_project_path(self, mocked_load_context):
    dummy_context = get_project_context("context")
    mocked_load_context.assert_called_once_with(Path.cwd())
    assert isinstance(dummy_context, DummyContext)
def test_verbose(self):
    assert not get_project_context("verbose")
def test_project_path(self):
    key = "project_path"
    pattern = self._deprecation_msg(key)
    with warns(DeprecationWarning, match=pattern):
        assert get_project_context(key) == "dummy_path"

def test_template_version(self):
    key = "template_version"
    pattern = self._deprecation_msg(key)
    with warns(DeprecationWarning, match=pattern):
        assert get_project_context(key) == "dummy_version"

def test_create_pipeline(self):
    key = "create_pipeline"
    pattern = self._deprecation_msg(key)
    with warns(DeprecationWarning, match=pattern):
        pipeline = get_project_context(key)
        assert pipeline() == "pipeline"

def test_create_catalog(self):
    key = "create_catalog"
    pattern = self._deprecation_msg(key)
    with warns(DeprecationWarning, match=pattern):
        catalog = get_project_context(key)
        assert catalog("config") == "catalog"

def test_get_config(self, tmp_path):
    key = "get_config"
    pattern = self._deprecation_msg(key)
    with warns(DeprecationWarning, match=pattern):
        config_loader = get_project_context(key)
        assert config_loader(tmp_path) == "config_loader"
def test_context(self):
    dummy_context = get_project_context("context")
    assert isinstance(dummy_context, DummyContext)
def _get_dag_filename():
    project_path = get_project_context("project_path")
    project_name = get_project_context("project_name")
    dest_dir = project_path / "airflow_dags"
    return dest_dir / (slugify(project_name, separator="_") + "_dag.py")
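# For example (paths are hypothetical): with project_name "My Project" and the
# project at /home/user/my-project, slugify("My Project", separator="_")
# yields "my_project", so the generated DAG path is
# /home/user/my-project/airflow_dags/my_project_dag.py.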
def _image_callback(ctx, param, value):  # pylint: disable=unused-argument
    image = value or str(get_project_context("project_path").name)
    check_docker_image_exists(image)
    return image
def test_get_context_with_project_path(self, tmpdir, mocked_load_context):
    dummy_project_path = tmpdir.mkdir("dummy_project")
    dummy_context = get_project_context(
        "context", project_path=dummy_project_path
    )
    mocked_load_context.assert_called_once_with(dummy_project_path)
    assert isinstance(dummy_context, DummyContext)