def _get_rules_file_keys(
    key_type: str,
    matches: Dict[str, Dict[str, bool]],
    rules_file: str,
) -> Dict[str, List[str]]:
    result: Dict[str, List[str]] = {}
    if key_type not in ["primary-key", "replication-key"]:
        raise ValueError(
            f"Unexpected key type '{key_type}'. "
            "Expected: 'replication-key' or 'primary-key'"
        )
    # Check rules_file to fill `matches`
    plan_file_lines = uio.get_text_file_contents(rules_file).splitlines()
    key_overrides = [
        line.split("->")[0].rstrip()
        for line in plan_file_lines
        if "->" in line and line.split("->")[1].strip() == key_type
    ]
    for key_spec in key_overrides:
        if len(key_spec.split(".")) != 2 or "*" in key_spec:
            raise ValueError(
                f"Expected '{key_type}' indicator with an exact two-part key, "
                f"separated by '.'. Found '{key_spec}'"
            )
        table_name, key_col_name = key_spec.split(".")
        if table_name not in matches:
            raise ValueError(f"Could not locate table '{table_name}' in selected list.")
        if key_col_name not in matches[table_name]:
            raise ValueError(f"Key column '{key_spec}' is not in column list.")
        elif not matches[table_name][key_col_name]:
            raise ValueError(f"Key column '{key_spec}' is not a selected column.")
        result[table_name] = [key_col_name]
    return result
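# Hedged usage sketch (illustration only; the rules-file path and `matches` values
# below are hypothetical and not part of the original module). Assumes the rules
# file contains a line such as `customers.customer_id -> primary-key`.
def _example_get_rules_file_keys() -> None:
    matches = {"customers": {"customer_id": True, "updated_at": True}}
    primary_keys = _get_rules_file_keys(
        key_type="primary-key",
        matches=matches,
        rules_file="taps/data.select",
    )
    print(primary_keys)  # Expected shape: {'customers': ['customer_id']}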
def change_upstream_source(
    dir_to_update=".",
    git_repo="https://github.com/slalom-ggp/dataops-infra",
    branch="master",
    relative_path="../../dataops-infra",
    to_relative=False,
    to_git=False,
    dry_run=False,
):
    """Change the Terraform module source in all '.tf' files under a directory."""
    if (to_relative and to_git) or not (to_relative or to_git):
        raise ValueError("Must specify `--to_git` or `--to_relative`, but not both.")
    for tf_file in uio.list_local_files(dir_to_update, recursive=False):
        if tf_file.endswith(".tf"):
            new_lines = []
            for line in uio.get_text_file_contents(tf_file).splitlines():
                new_line = line
                if line.lstrip().startswith("source "):
                    current_path = line.lstrip().split('"')[1]
                    start_pos = max(
                        current_path.find("catalog/"), current_path.find("components/")
                    )
                    if start_pos > 0:
                        module_path = current_path[start_pos:].split("?ref=")[0]
                        if to_relative:
                            local_pattern = "{relative_path}/{path}"
                            new_path = local_pattern.format(
                                relative_path=relative_path, path=module_path
                            )
                        elif to_git:
                            git_pattern = "git::{git_repo}//{path}?ref={branch}"
                            new_path = git_pattern.format(
                                git_repo=git_repo, path=module_path, branch=branch
                            )
                        if current_path == new_path:
                            print(f"{current_path} \n\t\t\t-> (unchanged)")
                        else:
                            print(f"{current_path} \n\t\t\t-> {new_path}")
                            new_line = f'  source = "{new_path}"'
                new_lines.append(new_line)
            new_file_text = "\n".join(new_lines)
            if dry_run:
                print(f"\n\n------------\n-- {tf_file}\n------------")
                print(new_file_text)
            else:
                uio.create_text_file(tf_file, new_file_text)
    if not dry_run:
        runnow.run("terraform fmt -recursive", dir_to_update)
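# Hedged usage sketch (illustration only; the directory path is hypothetical).
# Previews switching module sources to local relative paths without writing files.
def _example_change_upstream_source() -> None:
    change_upstream_source(
        dir_to_update="infra/prod",
        to_relative=True,
        dry_run=True,  # Print the rewritten *.tf contents instead of saving them.
    )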
def start_jupyter(nb_directory="/home/jovyan/work", nb_token="qwerty123"):
    jupyter_run_command = (
        "jupyter lab"
        f" --NotebookApp.notebook_dir='{nb_directory}'"
        f" --NotebookApp.token='{nb_token}'"
        " --allow-root"
    )
    log_file = "jupyter_log.txt"
    runnow.run(jupyter_run_command, daemon=True, log_file_path=log_file)
    time.sleep(5)
    logging.info(
        "\nJUPYTER_LOG:".join(uio.get_text_file_contents(log_file).splitlines())
    )
    logging.info(
        f"Jupyter notebooks server started at: https://localhost:8888/?token={nb_token}"
    )
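# Hedged usage sketch (illustration only; the directory and token values are
# hypothetical). Starts the Jupyter Lab daemon and relies on the short wait above
# for the log file to be populated.
def _example_start_jupyter() -> None:
    start_jupyter(
        nb_directory="/home/jovyan/work/notebooks",
        nb_token="local-dev-token",
    )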
def _get_plugins_list(
    plugins_index: Optional[str] = None,
) -> List[Tuple[str, Optional[str], Optional[str]]]:
    plugins_index = plugins_index or SINGER_PLUGINS_INDEX
    if not uio.file_exists(plugins_index):
        raise RuntimeError(
            f"No file found at '{plugins_index}'. "
            "Please set SINGER_PLUGINS_INDEX and try again."
        )
    yml_doc = yaml.safe_load(uio.get_text_file_contents(plugins_index))
    taps = yml_doc["singer-taps"]
    targets = yml_doc["singer-targets"]
    plugins = taps + targets
    list_of_tuples = []
    for plugin in plugins:
        list_of_tuples.append(
            (
                plugin["name"],
                plugin.get("source", None),
                plugin.get("alias", None),
            )
        )
    return list_of_tuples
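# Hedged usage sketch (illustration only; the index path is hypothetical).
# Iterates over the (name, source, alias) tuples returned by `_get_plugins_list()`.
def _example_list_plugins() -> None:
    for name, source, alias in _get_plugins_list("singer_index.yml"):
        print(f"{alias or name} <- {source or 'default pip source'}")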
def _discover(
    tap_name: str,
    taps_dir: str,
    *,
    config_file: str,
    catalog_dir: str,
    dockerized: bool,
    tap_exe: str,
) -> None:
    catalog_file = config.get_raw_catalog_file(
        taps_dir, catalog_dir, tap_name, allow_custom=False
    )
    uio.create_folder(catalog_dir)
    img = f"{docker.BASE_DOCKER_REPO}:{tap_exe}"
    hide_cmd = False
    if dockerized:
        cwd = os.getcwd().replace("\\", "/")
        tap_config = json.loads(uio.get_text_file_contents(config_file))
        tap_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
        _, _ = runnow.run(f"docker pull {img}")
        _, output_text = runnow.run(
            f"docker run --rm -i "
            f"-v {cwd}:/home/local {tap_docker_args} "
            f"{img} --config {config.dockerize_cli_args(config_file)} --discover",
            echo=False,
            capture_stderr=False,
            hide=hide_cmd,
        )
        if not _is_valid_json(output_text):
            raise RuntimeError(f"Could not parse json file from output:\n{output_text}")
        uio.create_text_file(catalog_file, output_text)
    else:
        runnow.run(
            f"{tap_exe} --config {config_file} --discover > {catalog_file}",
            hide=hide_cmd,
        )
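# Hedged usage sketch (illustration only; the tap name and all paths below are
# hypothetical). Runs discovery locally (non-dockerized), so the raw catalog is
# written via shell redirection.
def _example_discover() -> None:
    _discover(
        tap_name="covid-19",
        taps_dir="taps",
        config_file="taps/.secrets/tap-covid-19-config.json",
        catalog_dir="taps/.output/tap-covid-19",
        dockerized=False,
        tap_exe="tap-covid-19",
    )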
def _smart_split(dockerfile_path, image_name, addl_args=None):
    """
    Create two dockerfiles from a single file.

    1. The first 'core' image will contain all statements until the first COPY or ADD.
    2. The second 'derived' image will pull from 'core' and complete the build using
       local files or artifacts required by ADD or COPY commands.

    Returns a list of tuples:
    [
        (partial_image_name, partial_dockerfile_text),
        (derived_image_name, derived_dockerfile_text),
    ]
    """
    orig_text = uio.get_text_file_contents(dockerfile_path)
    addl_args = addl_args or ""
    core_dockerfile = ""
    derived_dockerfile = ""
    requires_context = False  # Whether we need file context to determine output
    for line in orig_text.split("\n"):
        if any([line.startswith("COPY"), line.startswith("ADD")]):
            requires_context = True
        if not requires_context:
            core_dockerfile += line + "\n"
        else:
            derived_dockerfile += line + "\n"
    core_md5 = hashlib.md5((addl_args + core_dockerfile).encode("utf-8")).hexdigest()
    full_md5 = hashlib.md5((addl_args + orig_text).encode("utf-8")).hexdigest()
    core_image_name = f"{image_name}:core-md5-{core_md5}"
    derived_image_name = f"{image_name}:md5-{full_md5}"
    core_dockerfile = (
        f"# DO NOT EDIT - file is generated automatically from `Dockerfile`\n\n"
        f"# Dockerfile.core - will be created and pushed as:\n"
        f"# \t{core_image_name}\n\n{core_dockerfile}"
    )
    if derived_dockerfile:
        derived_dockerfile = (
            f"# DO NOT EDIT - file is generated automatically from `Dockerfile`\n\n"
            f"FROM {core_image_name}\n\n{derived_dockerfile}"
        )
    else:
        derived_dockerfile = None  # No additional work to do.
    return [
        (core_image_name, core_dockerfile),
        (derived_image_name, derived_dockerfile),
    ]
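# Hedged usage sketch (illustration only; the Dockerfile path and image name are
# hypothetical). Shows how the core/derived split could be consumed, e.g. to build
# and cache the context-free 'core' image separately from the final image.
def _example_smart_split() -> None:
    (core_image, core_text), (derived_image, derived_text) = _smart_split(
        "Dockerfile", "my-registry/my-image"
    )
    print(f"Core image tag:    {core_image}")
    print(f"Derived image tag: {derived_image}")
    if derived_text is None:
        print("No COPY/ADD statements found; only the core image is needed.")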
def _create_selected_catalog(
    tap_name: str,
    plan_file: str,
    raw_catalog_file: str,
    output_file: str,
    replication_strategy: str,
    skip_senseless_validators: bool,
) -> None:
    taps_dir = config.get_taps_dir()
    catalog_dir = config.get_tap_output_dir(tap_name, taps_dir)
    output_file = output_file or os.path.join(catalog_dir, "selected-catalog.json")
    catalog_full = json.loads(Path(raw_catalog_file).read_text())
    plan_file = plan_file or config.get_plan_file(tap_name)
    plan = yaml.safe_load(uio.get_text_file_contents(plan_file))
    if ("selected_tables" not in plan) or (plan["selected_tables"] is None):
        raise ValueError(f"No selected tables found in plan file '{plan_file}'.")
    selected_tables = plan["selected_tables"]
    included_table_objects = []
    for tbl in sorted(catalog_full["streams"], key=_get_stream_name):
        stream_name = _get_stream_name(tbl)
        stream_id = _get_stream_id(tbl)
        if stream_name in selected_tables and stream_id == selected_tables[
            stream_name
        ].get("stream_id", stream_name):
            table_plan = selected_tables[stream_name]
            _set_catalog_file_keys(tbl, table_plan)
            _select_table(tbl, replication_strategy=replication_strategy)
            for col_name in _get_catalog_table_columns(tbl):
                col_selected = col_name in (
                    (table_plan["selected_columns"] or [])
                    + (table_plan["replication_key"] or [])
                    + (table_plan["primary_key"] or [])
                )
                _select_table_column(tbl, col_name, col_selected)
            if skip_senseless_validators:
                _remove_senseless_validators(tbl)
            included_table_objects.append(tbl)
    catalog_new = {"streams": included_table_objects}
    with open(output_file, "w") as f:
        json.dump(catalog_new, f, indent=2)
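# Hedged usage sketch (illustration only; all file paths and the replication
# strategy value are hypothetical). Builds a selected catalog from a plan file,
# keeping schema validators intact.
def _example_create_selected_catalog() -> None:
    _create_selected_catalog(
        tap_name="covid-19",
        plan_file="taps/data.select.plan.yml",
        raw_catalog_file="taps/.output/tap-covid-19/tap-covid-19-catalog-raw.json",
        output_file="taps/.output/tap-covid-19/selected-catalog.json",
        replication_strategy="INCREMENTAL",
        skip_senseless_validators=False,
    )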
def make_aggregate_state_file(raw_json_lines_file: str, output_json_file: str) -> None:
    """
    Create a valid json state file from one or more json lines ('jsonl' format).

    Parameters
    ----------
    raw_json_lines_file : str
        Path to a jsonl (json lines) file containing one or more json documents
        to aggregate.
    output_json_file : str
        Path to use when saving the aggregated json file.
    """
    try:
        uio.create_text_file(
            output_json_file,
            get_aggregate_state(uio.get_text_file_contents(raw_json_lines_file)),
        )
    except ValueError as ex:
        raise ValueError(
            f"State file from '{raw_json_lines_file}' is not valid JSON or JSONL. "
            f"Please either delete or fix the file and then retry. {ex}"
        )
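# Hedged usage sketch (illustration only; file paths are hypothetical). Collapses a
# JSONL state log into a single aggregated JSON state document.
def _example_make_aggregate_state_file() -> None:
    make_aggregate_state_file(
        raw_json_lines_file="state/tap-covid-19-state.jsonl",
        output_json_file="state/tap-covid-19-state.json",
    )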
def _sync_one_table(
    tap_name: str,
    table_name: str,
    taps_dir: str,
    config_file: str,
    target_name: str,
    target_config_file: str,
    table_catalog_file: str,
    table_state_file: str,
    log_dir: str,
    dockerized: bool,
    tap_exe: str,
    target_exe: str,
) -> None:
    if not tap_exe:
        tap_exe = f"tap-{tap_name}"
    pipeline_version_num = config.get_pipeline_version_number()
    table_state_file = config.replace_placeholders(
        {"table_state_file": table_state_file},
        tap_name,
        table_name,
        pipeline_version_num,
    )["table_state_file"]
    tap_args = f"--config {config_file} --catalog {table_catalog_file} "
    if uio.file_exists(table_state_file):
        local_state_file_in = os.path.join(
            config.get_tap_output_dir(tap_name, taps_dir),
            f"{tap_name}-{table_name}-state.json",
        )
        if not uio.get_text_file_contents(table_state_file):
            logging.warning(f"Ignoring blank state file from '{table_state_file}'.")
        else:
            states.make_aggregate_state_file(table_state_file, local_state_file_in)
            tap_args += f" --state {local_state_file_in}"
        local_state_file_out = (
            f"{'.'.join(local_state_file_in.split('.')[:-1])}-new.json"
        )
    else:
        local_state_file_out = os.path.join(
            config.get_tap_output_dir(tap_name, taps_dir),
            f"{tap_name}-{table_name}-state-new.json",
        )
    tmp_target_config = config.get_single_table_target_config_file(
        target_name,
        target_config_file,
        tap_name=tap_name,
        table_name=table_name,
        pipeline_version_num=pipeline_version_num,
    )
    target_args = f"--config {tmp_target_config} "
    hide_cmd = False
    if dockerized:
        cwd = os.getcwd().replace("\\", "/")
        tap_image_name = docker._get_docker_tap_image(tap_exe)
        target_image_name = docker._get_docker_tap_image(target_exe=target_exe)
        _, _ = runnow.run(f"docker pull {tap_image_name}")
        _, _ = runnow.run(f"docker pull {target_image_name}")
        tap_config = json.loads(uio.get_text_file_contents(config_file))
        target_config = json.loads(uio.get_text_file_contents(target_config_file))
        tap_docker_args = ""
        target_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
            if k in target_config:
                key = f"TARGET_{target_name}_{k}".replace("-", "_").upper()
                os.environ[key] = target_config[k]
                target_docker_args += f' -e {k.upper()}="{target_config[k]}"'
                hide_cmd = True
        sync_cmd = (
            f"docker run --rm -i -v {cwd}:/home/local {tap_docker_args} "
            f"{tap_image_name} "
            f"{config.dockerize_cli_args(tap_args)} "
            "| "
            f"docker run --rm -i -v {cwd}:/home/local {target_docker_args} "
            f"{target_image_name} "
            f"{config.dockerize_cli_args(target_args)} "
            ">> "
            f"{local_state_file_out}"
        )
    else:
        sync_cmd = (
            f"{tap_exe} "
            f"{tap_args} "
            "| "
            f"{target_exe} "
            f"{target_args} "
            "> "
            f"{local_state_file_out}"
        )
    runnow.run(sync_cmd, hide=hide_cmd)
    if not uio.file_exists(local_state_file_out):
        logging.warning(
            f"State file does not exist at path '{local_state_file_out}'. "
            "Skipping upload. This can be caused by having no data, or no new data, "
            "in the source table."
        )
    else:
        uio.upload_file(local_state_file_out, table_state_file)
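# Hedged usage sketch (illustration only; every name and path below is hypothetical).
# Syncs a single table locally (non-dockerized) from a tap to a target, piping the
# tap's output into the target and capturing emitted state.
def _example_sync_one_table() -> None:
    _sync_one_table(
        tap_name="covid-19",
        table_name="eu_daily",
        taps_dir="taps",
        config_file="taps/.secrets/tap-covid-19-config.json",
        target_name="csv",
        target_config_file="taps/.secrets/target-csv-config.json",
        table_catalog_file="taps/.output/tap-covid-19/eu_daily-catalog.json",
        table_state_file="taps/.output/tap-covid-19/eu_daily-state.json",
        log_dir="taps/.output/logs",
        dockerized=False,
        tap_exe="tap-covid-19",
        target_exe="target-csv",
    )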
def update_module_docs(
    tf_dir: str,
    *,
    recursive: bool = True,
    readme: str = "README.md",
    footer: bool = True,
    header: bool = True,
    special_case_words: Optional[List[str]] = None,
    extra_docs_names: List[str] = ["USAGE.md", "NOTES.md"],
    git_repo: str = "https://github.com/slalom-ggp/dataops-infra",
):
    """
    Replace all README.md files with auto-generated documentation, a wrapper
    around the `terraform-docs` tool.

    Parameters
    ----------
    tf_dir:
        Directory of terraform scripts to document.
    recursive:
        Optional (default=True). 'True' to run on all subdirectories, recursively.
    readme:
        Optional (default="README.md"). The filename to create when generating docs.
    footer:
        Optional (default=True). 'True' to include the standard footer.
    header:
        Optional (default=True). 'True' to include the standard header.
    special_case_words:
        Optional. A list of words to override special casing rules.
    extra_docs_names:
        Optional. A list of filenames which, if found, will be appended to each
        module's README.md file.
    git_repo:
        Optional. The git repo path to use in rendering 'source' paths.

    Returns
    -------
    None
    """
    markdown_text = ""
    if ".git" not in tf_dir and ".terraform" not in tf_dir:
        tf_files = [
            x
            for x in uio.list_local_files(tf_dir, recursive=False)
            if x.endswith(".tf")
        ]
        extra_docs = [
            x
            for x in uio.list_local_files(tf_dir, recursive=False)
            if extra_docs_names and os.path.basename(x) in extra_docs_names
        ]
        if tf_files:
            module_title = _proper(
                os.path.basename(tf_dir), special_case_words=special_case_words
            )
            parent_dir_name = os.path.basename(Path(tf_dir).parent)
            if parent_dir_name != ".":
                module_title = _proper(
                    f"{parent_dir_name} {module_title}",
                    special_case_words=special_case_words,
                )
            module_path = tf_dir.replace(".", "").replace("//", "/").replace("\\", "/")
            _, markdown_output = runnow.run(
                f"terraform-docs md --no-providers --sort-by-required {tf_dir}",
                echo=False,
            )
            if header:
                markdown_text += DOCS_HEADER.format(
                    module_title=module_title, module_path=module_path
                )
            markdown_text += markdown_output
            for extra_file in extra_docs:
                markdown_text += uio.get_text_file_contents(extra_file) + "\n"
            if footer:
                markdown_text += DOCS_FOOTER.format(
                    src="\n".join(
                        [
                            "* [{file}]({repo}/tree/master/{dir}/{file})".format(
                                repo=git_repo,
                                dir=module_path,
                                file=os.path.basename(tf_file),
                            )
                            for tf_file in tf_files
                        ]
                    )
                )
            uio.create_text_file(f"{tf_dir}/{readme}", markdown_text)
    if recursive:
        for folder in uio.list_local_files(tf_dir, recursive=False):
            if os.path.isdir(folder):
                update_module_docs(folder, recursive=recursive, readme=readme)
def update_module_docs(
    tf_dir: str,
    *,
    recursive: bool = True,
    readme: str = "README.md",
    footer: bool = True,
    header: bool = True,
    special_case_words: Optional[List[str]] = None,
    extra_docs_names: List[str] = ["USAGE.md", "NOTES.md"],
    git_repo: str = "https://github.com/slalom-ggp/dataops-infra",
) -> None:
    """
    Replace all README.md files with auto-generated documentation.

    This is a wrapper around the `terraform-docs` tool.

    Parameters
    ----------
    tf_dir : str
        Directory of terraform scripts to document.
    recursive : bool, optional
        Run on all subdirectories, recursively. By default True.
    readme : str, optional
        The filename to create when generating docs, by default "README.md".
    footer : bool, optional
        Include the standard footer, by default True.
    header : bool, optional
        Include the standard header, by default True.
    special_case_words : List[str], optional
        A list of words to override special casing rules, by default None.
    extra_docs_names : List[str], optional
        A list of filenames which, if found, will be appended to each
        module's README.md file, by default ["USAGE.md", "NOTES.md"].
    git_repo : str, optional
        The git repo path to use in rendering 'source' paths, by default
        "https://github.com/slalom-ggp/dataops-infra".
    """
    markdown_text = ""
    if ".git" not in tf_dir and ".terraform" not in tf_dir:
        tf_files = [
            x
            for x in uio.list_local_files(tf_dir, recursive=False)
            if x.endswith(".tf")
        ]
        extra_docs = [
            x
            for x in uio.list_local_files(tf_dir, recursive=False)
            if extra_docs_names and os.path.basename(x) in extra_docs_names
        ]
        if tf_files:
            module_title = _proper(
                os.path.basename(tf_dir), special_case_words=special_case_words
            )
            parent_dir_name = os.path.basename(Path(tf_dir).parent)
            # parent_title = _proper(
            #     parent_dir_name, special_case_words=special_case_words,
            # )
            module_title = _proper(
                f"{parent_dir_name} {module_title}",
                special_case_words=special_case_words,
            )
            module_path = tf_dir.replace(".", "").replace("//", "/").replace("\\", "/")
            _, markdown_output = runnow.run(
                f"terraform-docs markdown document --sort=false {tf_dir}",
                # " --no-requirements"
                echo=False,
            )
            if "components" in module_path.lower():
                module_type = "Components"
            elif "catalog" in module_path.lower():
                module_type = "Catalog"
            else:
                module_type = "Other"
            if header:
                markdown_text += DOCS_HEADER.format(
                    module_title=module_title,
                    module_path=module_path,
                    module_type=module_type,
                )
            markdown_text += markdown_output
            for extra_file in extra_docs:
                markdown_text += uio.get_text_file_contents(extra_file) + "\n"
            if footer:
                markdown_text += DOCS_FOOTER.format(
                    src="\n".join(
                        [
                            "* [{file}]({repo}/tree/main/{dir}/{file})".format(
                                repo=git_repo,
                                dir=module_path,
                                file=os.path.basename(tf_file),
                            )
                            for tf_file in tf_files
                        ]
                    )
                )
            uio.create_text_file(f"{tf_dir}/{readme}", markdown_text)
    if recursive:
        for folder in uio.list_local_files(tf_dir, recursive=False):
            if os.path.isdir(folder):
                update_module_docs(folder, recursive=recursive, readme=readme)
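# Hedged usage sketch (illustration only; the module directory is hypothetical).
# Regenerates README.md files for a Terraform module tree, including subdirectories.
def _example_update_module_docs() -> None:
    update_module_docs("catalog/aws", recursive=True)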
def _infer_schema(
    tap_name: str,
    taps_dir: str,
    raw_catalog_file: str,
    selected_catalog_file: str,
    *,
    config_file: str,
    catalog_dir: str,
    dockerized: bool,
    tap_exe: str,
) -> str:
    custom_catalog = json.loads(uio.get_text_file_contents(raw_catalog_file))
    tmp_folder = f"{catalog_dir}/tmp"
    tmp_outfile = f"{catalog_dir}/tmp/sync-dryrun.jsonl"
    uio.create_folder(catalog_dir)
    uio.create_folder(tmp_folder)
    logging.info(f"Cleaning up old files in tmp folder '{tmp_folder}'...")
    for file in uio.list_files(tmp_folder):
        if any(
            file.endswith(x)
            for x in ["-config.json", "-dryrun.jsonl", "-table.inferred.json"]
        ):
            uio.delete_file(file)
    img = f"{docker.BASE_DOCKER_REPO}:{tap_exe}"
    hide_cmd = False
    if dockerized:
        cwd = os.getcwd().replace("\\", "/")
        tap_config = json.loads(uio.get_text_file_contents(config_file))
        tap_docker_args = ""
        # TODO: Replace with logic to parse from AWS_SHARED_CREDENTIALS_FILE env var:
        for k in ["aws_access_key_id", "aws_secret_access_key", "aws_session_token"]:
            if k in tap_config:
                key = f"TAP_{tap_name}_{k}".replace("-", "_").upper()
                os.environ[key] = tap_config[k]
                tap_docker_args += f' -e {k.upper()}="{tap_config[k]}"'
                hide_cmd = True
        _, _ = runnow.run(f"docker pull {img}")
        _, jsonl_out = runnow.run(
            f"docker run --rm -i "
            f"-v {cwd}:/home/local {tap_docker_args} "
            f"{img} "
            f"--config {config.dockerize_cli_args(config_file)} "
            f"--catalog {selected_catalog_file}",
            hide=hide_cmd,
            echo=False,
            capture_stderr=False,
        )
    else:
        _, jsonl_out = runnow.run(
            f"{tap_exe} "
            f"--config {config_file} "
            f"--catalog {selected_catalog_file}",
            hide=hide_cmd,
            echo=False,
            capture_stderr=False,
        )
    uio.create_text_file(tmp_outfile, jsonl_out)
    _, jsonl_out = runnow.run(
        f"cat {tmp_outfile} | singer-infer-schema --out-dir {tmp_folder}",
    )
    for file in uio.list_files(tmp_folder):
        if file.endswith(".inferred.json"):
            logging.info(f"Parsing inferred schema from '{file}'...")
            inferred_schema = json.loads(uio.get_text_file_contents(file))
            stream_name = file.split("/")[-1].split(".")[0]
            stream = (
                [x for x in custom_catalog["streams"] if x["stream"] == stream_name]
                or [None]
            )[0]
            if not stream:
                raise ValueError(
                    f"Failed to append inferred schema for stream name '{stream_name}'."
                    f" Stream not present in catalog file {selected_catalog_file}."
                )
            stream["schema"] = inferred_schema
    custom_catalog_file = config.get_custom_catalog_file(taps_dir, tap_name)
    uio.create_text_file(custom_catalog_file, json.dumps(custom_catalog, indent=2))
    return custom_catalog_file
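# Hedged usage sketch (illustration only; all paths are hypothetical). Runs a local
# dry-run sync, infers schemas from the emitted records, and returns the path of the
# generated custom catalog file.
def _example_infer_schema() -> None:
    custom_catalog_file = _infer_schema(
        tap_name="covid-19",
        taps_dir="taps",
        raw_catalog_file="taps/.output/tap-covid-19/tap-covid-19-catalog-raw.json",
        selected_catalog_file="taps/.output/tap-covid-19/selected-catalog.json",
        config_file="taps/.secrets/tap-covid-19-config.json",
        catalog_dir="taps/.output/tap-covid-19",
        dockerized=False,
        tap_exe="tap-covid-19",
    )
    print(f"Custom catalog written to: {custom_catalog_file}")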
def _check_rules(
    tap_name: str,
    catalog_file: str,
    rules_file: str,
    replication_strategy: str,
    plan_file_out: str,
    selected_catalog_file_out: str,
    log_dir: Optional[str],
) -> None:
    """
    Create plan file and selected catalog file from provided rules and raw catalog.

    Parameters
    ----------
    tap_name : str
        Name of the tap.
    catalog_file : str
        Path to a catalog file.
    rules_file : str
        Path to a rules file.
    replication_strategy : str
        Replication strategy to apply when selecting tables.
    plan_file_out : str
        Path to save the plan file.
    selected_catalog_file_out : str
        Path to save the selected catalog file.
    log_dir : Optional[str]
        Optional directory to which copies of the rules, plan, and selected catalog
        files are pushed.
    """
    select_rules = [
        line.split("#")[0].rstrip()
        for line in uio.get_text_file_contents(rules_file).splitlines()
        if line.split("#")[0].rstrip()
    ]
    matches: Dict[str, dict] = {}
    excluded_table_stream_ids: Dict[str, List[str]] = {}
    matched_stream_ids: Dict[str, str] = {}
    for stream_id, table_object in _get_catalog_tables_dict(catalog_file).items():
        table_name = _get_stream_name(table_object)
        if _table_match_check(
            table_name=table_name,
            stream_id=stream_id,
            select_rules=select_rules,
        ):
            if table_name in matched_stream_ids:
                raise RuntimeError(
                    f"Table name '{table_name}' matched multiple stream IDs: "
                    f'"{matched_stream_ids[table_name]}" and "{stream_id}". '
                    "This is most often caused by tables with the same name under "
                    "different source database schemas. Please qualify or disqualify "
                    "specific stream name patterns by using double-quoted stream IDs "
                    "in your rules file instead of or in addition to bare table names."
                )
            matched_stream_ids[table_name] = stream_id
            matches[table_name] = {}
            for col_object in _get_catalog_table_columns(table_object):
                col_name = col_object
                matches[table_name][col_name] = _col_match_check(
                    table_name, stream_id, col_name, select_rules
                )
        else:
            if table_name in excluded_table_stream_ids:
                excluded_table_stream_ids[table_name].append(stream_id)
            else:
                excluded_table_stream_ids[table_name] = [stream_id]
    all_matches_lower = [m.lower() for m in matches.keys()] + [
        f'"{m.lower()}"' for m in matched_stream_ids.values()
    ]
    declared_tables = set(
        rule.split(".")[0].rstrip().lstrip("!")
        for rule in select_rules
        if rule.split(".")[0].rstrip() and ("*" not in rule.split(".")[0])
    )
    for required_table in declared_tables:
        if required_table.lower() not in all_matches_lower:
            logging.warning(
                f"The table '{required_table}' was declared in the rules file "
                "but could not be found in the catalog."
            )
    for match, match_cols in matches.items():
        if not match_cols:
            logging.warning(
                f"The table '{match}' was declared in the rules file "
                "but did not match with any columns in the catalog."
            )
    primary_keys, replication_keys = _get_table_keys(
        matches, matched_stream_ids, catalog_file, rules_file
    )
    file_text = _make_plan_file_text(
        matches,
        primary_keys,
        replication_keys,
        matched_stream_ids,
        excluded_table_stream_ids,
    )
    logging.info(f"Updating plan file: {plan_file_out}")
    uio.create_text_file(plan_file_out, file_text)
    config.push_logs(log_dir, [rules_file, plan_file_out])
    _create_selected_catalog(
        tap_name,
        plan_file=plan_file_out,
        raw_catalog_file=catalog_file,
        output_file=selected_catalog_file_out,
        replication_strategy=replication_strategy,
        skip_senseless_validators=SKIP_SENSELESS_VALIDATORS,
    )
    config.push_logs(log_dir, [selected_catalog_file_out])
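# Hedged usage sketch (illustration only; all paths and the replication strategy
# value are hypothetical). Evaluates a rules file against a raw catalog, then writes
# both the plan file and the selected catalog file.
def _example_check_rules() -> None:
    _check_rules(
        tap_name="covid-19",
        catalog_file="taps/.output/tap-covid-19/tap-covid-19-catalog-raw.json",
        rules_file="taps/data.select",
        replication_strategy="INCREMENTAL",
        plan_file_out="taps/data.select.plan.yml",
        selected_catalog_file_out="taps/.output/tap-covid-19/selected-catalog.json",
        log_dir=None,
    )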
def parse_from_file(filepath: str, /) -> List[Dict[str, List]]:
    """Read the text file at `filepath` and parse it via `parse_from_string()`."""
    return parse_from_string(uio.get_text_file_contents(filepath))
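# Hedged usage sketch (illustration only; the file path is hypothetical).
def _example_parse_from_file() -> None:
    parsed = parse_from_file("taps/data.select")
    print(parsed)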