def run_command(command, env=None):
    "Runs a shell command and streams output to the log"
    env = env or {}
    with subprocess.Popen(
        shlex.split(command),
        # Merge the caller-supplied environment over the current process environment
        env={**os.environ, **env},
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    ) as proc:
        # Stream stdout line by line until the process has exited and output is drained
        while True:
            output = proc.stdout.readline().decode("utf-8")
            if len(output) == 0 and proc.poll() is not None:
                break
            if output:
                LOG.info(output.strip())

        return_code = proc.poll()
        if return_code != 0:
            raise RunCommandError(
                "Error running shell command, please see log for details."
            )

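# A minimal usage sketch (illustrative, not part of the module): assumes LOG is a
# configured `logging` logger and RunCommandError is defined alongside run_command.
#
#     import logging
#     logging.basicConfig(level=logging.INFO)
#     # Streams the command's output into the log; MY_FLAG is a hypothetical variable
#     # layered on top of os.environ for the child process only:
#     run_command("ls -la", env={"MY_FLAG": "1"})
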
def clean_target_test_data(engine, api):
    "Removes target data from the test database that might be left over from a previous test run"
    insp = reflection.Inspector.from_engine(engine)

    # Target names are fully qualified; the second dotted component is the schema/namespace
    namespaces = {target.split(".")[1] for target in api.spec["targets"].keys()}
    for namespace in namespaces:
        execute_sqls(engine, [f"CREATE SCHEMA IF NOT EXISTS {namespace}"])

        tables = engine.table_names(schema=namespace)
        LOG.debug("Found existing tables: %s", tables)
        views = list(insp.get_view_names(schema=namespace))
        LOG.debug("Found existing views: %s", views)

        # Only drop targets that actually exist, using the right DROP statement
        # for tables vs views
        targets = api.spec["targets"].keys()
        target_tables = [
            target for target in targets if target.split(".")[-1] in tables
        ]
        target_views = [
            target for target in targets if target.split(".")[-1] in views
        ]

        execute_sqls(
            engine,
            [f"DROP TABLE IF EXISTS {target}" for target in target_tables],
        )
        execute_sqls(
            engine,
            [f"DROP VIEW IF EXISTS {target}" for target in target_views],
        )

def load_test_data(source_engines, api, schemas_path):
    "Loads test data generated by dtspec into the test databases"
    schema_metadata = read_sa_metadata(schemas_path)
    source_fqn_to_sa = _source_fqn_to_sa(source_engines, schema_metadata)

    # Group TRUNCATE and INSERT statements by source environment so each
    # engine is only touched once
    truncate_by_env_sqls = {env: [] for env in source_engines.keys()}
    insert_by_env_sqls = {env: [] for env in source_engines.keys()}
    for source_name, data in api.spec["sources"].items():
        try:
            this_source_meta = source_fqn_to_sa[source_name]
        except KeyError as err:
            raise KeyError(
                f"Unable to find source {source_name} in schema metadata: {source_fqn_to_sa.keys()}"
            ) from err

        serialized_data = data.serialize()
        source_insert = (
            this_source_meta["sa_table"]
            .insert(bind=this_source_meta["engine"])
            .values(sa_serialize(serialized_data))
        )

        truncate_by_env_sqls[this_source_meta["env"]].append(
            f"TRUNCATE {source_name};"
        )
        if len(serialized_data) > 0:
            insert_by_env_sqls[this_source_meta["env"]].append(source_insert)

    for env, source_engine in source_engines.items():
        LOG.info("Loading test data into source test environment %s", env)
        execute_sqls(engine=source_engine, sqls=truncate_by_env_sqls[env])
        execute_sqls(engine=source_engine, sqls=insert_by_env_sqls[env])

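# Shape assumed by load_test_data, inferred from the code above rather than from
# dtspec documentation: api.spec["sources"] maps fully qualified source names to
# dtspec data objects whose .serialize() returns a list of record dicts, e.g.
#
#     api.spec["sources"] == {
#         "analytics.raw.customers": <data>,
#         # where <data>.serialize() -> [{"id": "1", "name": "alice"}, ...]
#     }
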
def _init_test_db(config, env=None, clean=False):
    LOG.info("initializing test db env: %s", env)
    env_config = config["source_environments"][env]
    engine = _engine_from_config(env_config["test"])
    dtspec.db.init_test_db(
        env=env, engine=engine, schemas_path=SCHEMAS_PATH, clean=clean
    )

def _reflect_table(metadata, engine, namespace, table_name):
    LOG.info("Reflecting table %s.%s", namespace, table_name)
    return sa.Table(
        table_name,
        metadata,
        autoload=True,
        autoload_with=engine,
        schema=namespace,
        resolve_fks=False,
    )

def reflect(env, engine, output_path, namespace="public", tables=None):
    "Reflects all specified tables and saves the table schemas as yaml files"
    tables = tables or []
    metadata = sa.MetaData()

    reflected_table_names = _reflect_table_names(engine, namespace)
    selected_table_names = _select_tables(tables, reflected_table_names)
    LOG.debug("Reflecting tables: %s", selected_table_names)

    _reflect_tables(metadata, engine, namespace, selected_table_names)
    _write_yaml(output_path, env, namespace, metadata)

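# A usage sketch, assuming a SQLAlchemy engine pointed at the schema database
# (the DSN and table name below are hypothetical):
#
#     engine = sa.create_engine("postgresql://user:pass@localhost/warehouse")
#     reflect(
#         env="raw",
#         engine=engine,
#         output_path="schemas/",
#         namespace="public",
#         tables=["customers"],
#     )
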
def get_actuals(engine, api):
    "Extracts data from the targets of the data transformation and serializes them for comparison with expected values"
    serialized_actuals = {}
    with engine.connect() as conn:
        for target in api.spec["targets"].keys():
            LOG.info("Fetching actual data for target %s", target)
            sa_results = conn.execute(f"SELECT * FROM {target}").fetchall()
            serialized_actuals[target] = {
                "records": [
                    {key: _stringify_sa_value(val) for key, val in row.items()}
                    for row in sa_results
                ],
                # Guard against targets that returned no rows
                "columns": list(sa_results[0].keys()) if sa_results else [],
            }
    return serialized_actuals

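# Illustrative shape of the return value (the target name and columns are
# hypothetical; all values are stringified by _stringify_sa_value):
#
#     {
#         "analytics.dim_customers": {
#             "records": [{"id": "1", "name": "alice"}, ...],
#             "columns": ["id", "name"],
#         },
#     }
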
def read_sa_metadata(schema_path):
    """
    Reads SQLAlchemy schema metadata saved in yaml files.
    Returns a dictionary with the following structure:

        {
            'environment name 1': {
                'namespace 1': {
                    'table 1': sqlalchemy.Table object,
                    'table 2': sqlalchemy.Table object,
                },
                'namespace 2': { ... },
            },
            'environment name 2': { ... },
        }
    """
    LOG.debug("Reading schema metadata from path %s", schema_path)
    metadata = sa.MetaData()
    schemas = {}
    for yaml_file in glob.glob(os.path.join(schema_path, "*.schema.yml")):
        LOG.debug("Reading schema metadata from %s", yaml_file)
        yaml_basename = os.path.basename(yaml_file)

        # Filenames follow the convention <env>.<namespace>.schema.yml
        parsed_filename = re.search(r"([^.]+)\.([^.]+)\.schema\.yml", yaml_basename)
        env = parsed_filename.group(1)
        namespace = parsed_filename.group(2)
        schemas[env] = schemas.get(env, {})

        with open(yaml_file, "r") as yfile:
            yaml_txt = yfile.read()
        schema_def = yaml.unsafe_load(yaml_txt)

        schemas[env][namespace] = {
            table_name: _sa_table_from_yaml(metadata, namespace, table_name, table_def)
            for table_name, table_def in schema_def.items()
        }
    return schemas

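# A minimal sketch of the filename convention, assuming a schemas/ directory
# containing a file named raw.public.schema.yml:
#
#     schemas = read_sa_metadata("schemas/")
#     # "raw.public.schema.yml" parses to env="raw", namespace="public", so:
#     # schemas["raw"]["public"] == {"some_table": sa.Table(...), ...}
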
def _fetch_schema(config, env):
    LOG.info("fetching schemas for env: %s", env)
    env_config = config["source_environments"][env]
    engine = _engine_from_config(env_config["schema"])

    output_path = os.path.join(DTSPEC_ROOT, "schemas")
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

    for namespace, tables in env_config["tables"].items():
        dtspec.db.reflect(
            env=env,
            engine=engine,
            output_path=output_path,
            namespace=namespace,
            tables=tables,
        )

def run_dbt(
    cmd="run",
    profiles_dir=None,
    target="dev",
    models=None,
    exclude=None,
    full_refresh=False,
    env=None,
    partial_parse=False,
):
    "Constructs common dbt parameters and runs dbt in a shell"
    profiles_dir = profiles_dir or os.environ.get("DBT_PROFILES_DIR", "~/.dbt/")
    env = env or {}

    models_cmd = f"--models {models}" if models else ""
    exclude_cmd = f"--exclude {exclude}" if exclude else ""
    full_refresh_cmd = "--full-refresh" if full_refresh else ""
    partial_parse_cmd = "--partial-parse" if partial_parse else ""

    shell_cmd = (
        f"dbt {partial_parse_cmd} {cmd} --profiles-dir={profiles_dir} "
        f"--target={target} {full_refresh_cmd} {models_cmd} {exclude_cmd}"
    )
    LOG.info("Running dbt via: %s", shell_cmd)
    try:
        run_command(shell_cmd, env=env)
    except RunCommandError as err:
        raise DbtRunError(
            f"dbt failed to {cmd} successfully, please see log for details"
        ) from err

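# A usage sketch (the target and model selector are hypothetical):
#
#     run_dbt(cmd="run", target="ci", models="staging.*", full_refresh=True)
#     # builds and runs roughly (extra whitespace from empty flags is harmless):
#     # dbt run --profiles-dir=~/.dbt/ --target=ci --full-refresh --models staging.*
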
def worker_execute_sqls(engine, worker_sqls):
    # Run all statements on one connection inside a single transaction; the
    # context managers close the connection and commit on success (the original
    # explicit trans.commit() inside the block was redundant and the connection
    # was never closed)
    with engine.connect() as conn:
        with conn.begin():
            for worker_sql in worker_sqls:
                LOG.debug("Executing sql: %s", worker_sql)
                conn.execute(worker_sql)

def _get_actuals(config, api, target):
    target_config = config["target_environments"][target]
    engine = _engine_from_config(target_config)

    LOG.info("Fetching results of run from target test environment %s", target)
    return dtspec.db.get_actuals(engine, api)

def _clean_target_test_data(config, api, target):
    target_config = config["target_environments"][target]
    engine = _engine_from_config(target_config)

    LOG.info("Cleaning out target test data for target test environment %s", target)
    dtspec.db.clean_target_test_data(engine, api)