def __checkout_branch(self, branch, full_path):
    """Run ``git checkout <branch>`` with ``full_path`` as working directory.

    stdout and stderr of the git process are captured through pipes and
    then discarded; the exit status is not inspected.
    """
    logger.info(" checking out branch {}".format(branch))

    command = ['git', 'checkout', branch]
    git_process = subprocess.Popen(
        command,
        cwd=full_path,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # communicate() drains both pipes and waits for git to exit.
    git_process.communicate()
def __pull_deps_recursive(self, repos, processed_repos=None, i=0):
    """Pull each repository in ``repos`` and, recursively, its own repos.

    :param repos: iterable of repository strings; each is split into a
        remote and a branch by ``__split_at_branch``.
    :param processed_repos: set of dependency folders already handled;
        shared across the recursion so a folder is only pulled once.
    :param i: recursion depth, incremented on every nested call.

    If a pulled dependency has no ``dbt_project.yml`` (``ENOENT``), a
    message is logged and the process exits with status 1. Any other
    ``IOError`` is re-raised.
    """
    if processed_repos is None:
        processed_repos = set()

    for repo_string in repos:
        repo, branch = self.__split_at_branch(repo_string)
        repo_folder = folder_from_git_remote(repo)

        try:
            if repo_folder in processed_repos:
                logger.info(
                    "skipping already processed dependency {}".format(
                        repo_folder))
                continue

            dep_folder = self.__pull_repo(repo, branch)
            project_file = os.path.join(
                self.project['modules-path'], dep_folder, 'dbt_project.yml')
            dep_project = project.read_project(
                project_file,
                self.project.profiles_dir,
                profile_to_load=self.project.profile_to_load)

            processed_repos.add(dep_folder)
            self.__pull_deps_recursive(
                dep_project['repositories'], processed_repos, i + 1)
        except IOError as e:
            # Only a missing project file is handled here.
            if e.errno != errno.ENOENT:
                raise e
            logger.info("'{}' is not a valid dbt project - "
                        "dbt_project.yml not found".format(repo))
            exit(1)
def print_compile_stats(stats):
    """Log a one-line "Found ..." summary of node counts per type.

    :param stats: mapping of node type to count. Known types missing
        from it are reported with a count of 0.
    """
    names = {
        NodeType.Model: 'models',
        NodeType.Test: 'tests',
        NodeType.Archive: 'archives',
        NodeType.Analysis: 'analyses',
        NodeType.Macro: 'macros',
        NodeType.Operation: 'operations',
    }

    # Start every known type at zero, then overlay the real counts.
    results = dict.fromkeys(names, 0)
    results.update(stats)

    parts = []
    for node_type, count in results.items():
        parts.append("{} {}".format(count, names.get(node_type)))

    logger.info("Found {}".format(", ".join(parts)))
def compile(self):
    """Initialize a ``Compiler``, run it and log the compiled archive count."""
    compiler = Compiler(self.project, self.args)
    compiler.initialize()

    archive_count = compiler.compile()['archives']
    logger.info("Compiled {} archives".format(archive_count))
def open(cls, connection):
    """Attach a BigQuery client to ``connection`` and mark it open.

    :param connection: connection object exposing ``state``,
        ``credentials`` and ``handle`` attributes.
    :returns: the same ``connection`` object, with ``handle`` set and
        ``state == 'open'``. A connection that is already open is
        returned untouched.
    :raises dbt.exceptions.FailedToConnectException: if the client cannot
        be created; ``connection.state`` is set to ``'fail'`` first.

    When default credentials are missing, the gcloud default-credential
    setup is triggered and client creation is retried once.
    """
    if connection.state == 'open':
        logger.debug('Connection is already open, skipping open.')
        return connection

    try:
        handle = cls.get_bigquery_client(connection.credentials)

    except google.auth.exceptions.DefaultCredentialsError:
        logger.info("Please log into GCP to continue")
        dbt.clients.gcloud.setup_default_credentials()

        handle = cls.get_bigquery_client(connection.credentials)

    except Exception as e:
        # A leftover ``raise`` used to sit at the top of this handler,
        # which made the logging, the state update and the
        # FailedToConnectException below unreachable.
        logger.debug("Got an error when attempting to create a bigquery "
                     "client: '{}'".format(e))

        connection.handle = None
        connection.state = 'fail'

        raise dbt.exceptions.FailedToConnectException(str(e))

    connection.handle = handle
    connection.state = 'open'
    return connection
def find_schema_yml(cls, package_name, root_dir, relative_dirs):
    """Yield ``(file path, parsed yaml)`` pairs for matching .yml files.

    Shared by the v1 and v2 schema parsers. Files are searched for in
    ``relative_dirs`` under ``root_dir``. A file that fails yaml
    validation is logged and skipped, as is a file whose parsed content
    is ``None``.
    """
    extension = "[!.#~]*.yml"
    file_matches = dbt.clients.system.find_matching(
        root_dir, relative_dirs, extension)

    for file_match in file_matches:
        file_contents = dbt.clients.system.load_file_contents(
            file_match.get('absolute_path'), strip=False)
        test_path = file_match.get('relative_path', '')
        original_file_path = os.path.join(
            file_match.get('searched_path'), test_path)

        try:
            test_yml = dbt.clients.yaml_helper.load_yaml_text(file_contents)
        except dbt.exceptions.ValidationException as e:
            logger.info("Error reading {}:{} - Skipping\n{}".format(
                package_name, test_path, e))
            continue

        if test_yml is None:
            continue

        yield original_file_path, test_yml
def __pull_repo(self, repo, branch=None):
    """Clone or update ``repo`` under the modules path and check out ``branch``.

    The target folder name is recovered from git's stderr output: either
    the "already exists" message (existing dependency) or the
    "Cloning into" message (new dependency).

    :param repo: git remote to clone.
    :param branch: passed through to ``dbt.clients.git.checkout``.
    :returns: the folder name of the dependency inside the modules path.
    :raises RuntimeError: if git's output matches neither expected
        message, so the folder cannot be determined (for example when
        the clone itself failed).
    """
    modules_path = self.project['modules-path']

    _out, err = dbt.clients.git.clone(repo, modules_path)
    clone_output = err.decode('utf-8')

    exists = re.match("fatal: destination path '(.+)' already exists",
                      clone_output)

    if exists:
        folder = exists.group(1)
        logger.info('Updating existing dependency {}.'.format(folder))
    else:
        matches = re.match("Cloning into '(.+)'", clone_output)
        if matches is None:
            # Previously this crashed with an opaque AttributeError on
            # ``None.group``; surface git's own output instead.
            raise RuntimeError(
                "Could not determine the checkout folder for '{}' from "
                "git output: {}".format(repo, clone_output.strip()))
        folder = matches.group(1)
        logger.info('Pulling new dependency {}.'.format(folder))

    dependency_path = os.path.join(modules_path, folder)
    start_sha = dbt.clients.git.get_current_sha(dependency_path)
    dbt.clients.git.checkout(dependency_path, branch)
    end_sha = dbt.clients.git.get_current_sha(dependency_path)

    if exists:
        if start_sha == end_sha:
            logger.info(' Already at {}, nothing to do.'.format(
                start_sha[:6]))
        else:
            logger.info(' Updated checkout from {} to {}.'.format(
                start_sha[:6], end_sha[:6]))
    else:
        logger.info(' Checked out at {}.'.format(end_sha[:6]))

    return folder
def path_info(self):
    """Log ``PROFILE_DIR_MESSAGE`` filled in with the profiles directory."""
    logger.info(PROFILE_DIR_MESSAGE.format(
        open_cmd=dbt.clients.system.open_dir_cmd(),
        profiles_dir=self.profiles_dir,
    ))
def safe_run_hooks(cls, project, adapter, flat_graph, hook_type):
    """Run hooks of ``hook_type``, logging before re-raising on failure.

    A ``dbt.exceptions.RuntimeException`` from ``run_hooks`` is logged at
    info level and then propagated unchanged.
    """
    try:
        cls.run_hooks(project, adapter, flat_graph, hook_type)
    except dbt.exceptions.RuntimeException:
        message = "Database error while running {}".format(hook_type)
        logger.info(message)
        raise
def find_schema_yml(cls, package_name, root_dir, relative_dirs):
    """Yield ``(file path, parsed yaml)`` for each usable schema .yml file.

    Common to the v1 and v2 parsers: searches ``relative_dirs`` under
    ``root_dir``. Files that raise a validation error while loading are
    logged and skipped; files that load as ``None`` are skipped too.
    """
    pattern = "[!.#~]*.yml"
    matches = dbt.clients.system.find_matching(
        root_dir, relative_dirs, pattern)

    for match in matches:
        contents = dbt.clients.system.load_file_contents(
            match.get('absolute_path'), strip=False)
        test_path = match.get('relative_path', '')
        original_file_path = os.path.join(
            match.get('searched_path'), test_path)

        test_yml = None
        try:
            test_yml = dbt.clients.yaml_helper.load_yaml_text(contents)
        except dbt.exceptions.ValidationException as e:
            logger.info("Error reading {}:{} - Skipping\n{}".format(
                package_name, test_path, e))

        if test_yml is not None:
            yield original_file_path, test_yml
def print_compile_stats(stats):
    """Log a "Found ..." summary of resource counts and track them.

    :param stats: mapping of node type to count. Known types missing
        from it are reported with a count of 0. Types that have no entry
        in the name table are left out of the log line.
    """
    names = {
        NodeType.Model: 'model',
        NodeType.Test: 'test',
        NodeType.Snapshot: 'snapshot',
        NodeType.Analysis: 'analysis',
        NodeType.Macro: 'macro',
        NodeType.Operation: 'operation',
        NodeType.Seed: 'seed file',
        NodeType.Source: 'source',
        NodeType.Exposure: 'exposure',
    }

    results = dict.fromkeys(names, 0)
    results.update(stats)

    # create tracking event for resource_counts
    if dbt.tracking.active_user is not None:
        resource_counts = {}
        for node_type, count in results.items():
            resource_counts[node_type.pluralize()] = count
        dbt.tracking.track_resource_counts(resource_counts)

    fragments = []
    for node_type, count in results.items():
        if node_type in names:
            fragments.append(pluralize(count, names.get(node_type)))

    logger.info("Found {}".format(", ".join(fragments)))
def safe_run_hooks(cls, config, adapter, manifest, hook_type):
    """Run hooks of ``hook_type``; log and re-raise dbt runtime errors."""
    try:
        cls.run_hooks(config, adapter, manifest, hook_type)
    except dbt.exceptions.RuntimeException:
        failure = "Database error while running {}".format(hook_type)
        logger.info(failure)
        raise
def migrate_archive_ctas(self, dest):
    """Copy the archive relation into ``dest`` with renamed columns.

    Builds and executes a ``create table ... as (select ...)`` statement.
    Columns listed by ``get_renamed_columns()`` are selected as
    ``old as new``; every other column is selected by its quoted name.
    If the relation has no columns, nothing is executed.

    :param dest: destination relation for the new table.
    :raises Exception: if a column that should be renamed is not present
        in the relation.
    """
    # get the columns
    columns = self.adapter.get_columns_in_relation(self.relation)
    if len(columns) == 0:
        # the archive target must not exist? Continue, that is ok.
        logger.info(
            ' - Table {} does not exist, nothing to migrate.'.format(
                self.relation))
        return

    # lower-cased name -> actual name, for columns that are NOT renamed
    remaining = {c.name.lower(): c.name for c in columns}

    select_parts = []
    select_as_parts = []

    for old, new in self.get_renamed_columns():
        key = old.strip('"').lower()
        if key not in remaining:
            raise Exception(
                'expected column like {}, but it is not in the table!'
                .format(key))
        del remaining[key]
        select_as_parts.append('{} as {}'.format(old, new))

    # Keep the relation's own column order for the untouched columns.
    for column in columns:
        name = column.name
        if name.lower() in remaining:
            select_parts.append(self.adapter.quote(name))

    selections = ', '.join(itertools.chain(select_parts, select_as_parts))
    ctas = 'create table {!s} as (select {} from {!s})'.format(
        dest, selections, self.relation)
    self.adapter.execute(ctas)
def dependency_projects(project):
    """Yield every dependency project readable from the module directories.

    Both the global dbt modules path and the project's own modules path
    are scanned. Each sub-directory (except ones starting with ``__``) is
    read as a dbt project. Directories that fail with a
    ``DbtProjectError`` are logged and skipped.
    """
    project_modules = os.path.join(
        project['project-root'], project['modules-path'])

    for module_path in [GLOBAL_DBT_MODULES_PATH, project_modules]:
        logger.debug("Loading dependency project from {}".format(module_path))

        for obj in os.listdir(module_path):
            full_obj = os.path.join(module_path, obj)

            # only real directories count, and not dunder ones such as
            # __pycache__ in the global dbt modules dir
            is_candidate = (
                os.path.isdir(full_obj) and not obj.startswith('__'))
            if not is_candidate:
                continue

            try:
                yield dbt.project.read_project(
                    os.path.join(full_obj, 'dbt_project.yml'),
                    project.profiles_dir,
                    profile_to_load=project.profile_to_load,
                    args=project.args)
            except dbt.project.DbtProjectError as e:
                logger.info(
                    "Error reading dependency project at {}".format(full_obj))
                logger.info(str(e))
def migrate_archive_snowflake(self):
    """Migrate the archive via create-table-as-select, then swap tables.

    The migrated copy is first built at ``self.backup``. An
    ``alter table ... swap with`` statement then exchanges it with
    ``self.relation``.
    """
    backup = self.backup

    logger.debug(' - Making new archive at {}'.format(backup))
    self.migrate_archive_ctas(dest=backup)

    logger.info(' - Backing up table to {}'.format(self.backup))
    swap_sql = 'alter table {!s} swap with {!s}'.format(
        self.relation, self.backup)
    self.adapter.execute(swap_sql)
def warn_if_useless_spec(spec, nodes):
    """Log a notice when ``spec`` selected no nodes at all.

    :param spec: mapping whose ``'raw'`` entry is reported in the message.
    :param nodes: collection of nodes the spec matched.
    """
    if len(nodes) == 0:
        notice = (
            "* Spec='{}' does not identify any models and was ignored\n"
            .format(spec['raw']))
        logger.info(notice)
def main(args=None):
    """Command-line entry point: run dbt and exit with a status code.

    :param args: argument list; defaults to ``sys.argv[1:]``.

    The process always ends through ``sys.exit``. The code comes from
    ``ExitCodes`` (success, model error, unhandled error) or from a
    ``SystemExit`` raised while handling the arguments.
    """
    if args is None:
        args = sys.argv[1:]

    with log_manager.applicationbound():
        try:
            results, succeeded = handle_and_check(args)
            exit_code = (
                ExitCodes.Success.value if succeeded
                else ExitCodes.ModelError.value
            )

        except KeyboardInterrupt:
            logger.info("ctrl-c")
            exit_code = ExitCodes.UnhandledError.value

        except SystemExit as e:
            # This can be thrown by eg. argparse
            exit_code = e.code

        except BaseException as e:
            logger.warning("Encountered an error:")
            logger.warning(str(e))

            stack_trace_logged_at_debug = log_manager.initialized
            if stack_trace_logged_at_debug:
                logger.debug(traceback.format_exc())
            elif not isinstance(e, RuntimeException):
                # if it did not come from dbt proper and the logger is not
                # initialized (so there's no safe path to log to), log the
                # stack trace at error level.
                logger.error(traceback.format_exc())
            exit_code = ExitCodes.UnhandledError.value

    sys.exit(exit_code)
def run_dbt_and_check(self, args=None, strict=True, parser=False,
                      profiles_dir=True):
    """Invoke dbt with test flags and return ``dbt.handle_and_check``'s result.

    :param args: dbt arguments; defaults to ``["run"]``. The list is not
        modified.
    :param strict: prepend ``--strict``.
    :param parser: prepend ``--test-new-parser``.
    :param profiles_dir: append ``--profiles-dir <self.test_root_dir>``.

    ``--single-threaded`` is added when the ``DBT_TEST_SINGLE_THREADED``
    environment variable is ``y``, ``Y`` or ``1``. ``--log-cache-events``
    is always appended.
    """
    log_manager.reset_handlers()
    if args is None:
        args = ["run"]

    final_args = []

    if strict:
        final_args.append('--strict')
    if parser:
        final_args.append('--test-new-parser')
    if os.getenv('DBT_TEST_SINGLE_THREADED') in ('y', 'Y', '1'):
        final_args.append('--single-threaded')

    final_args.extend(args)

    if profiles_dir:
        final_args.extend(['--profiles-dir', self.test_root_dir])
    final_args.append('--log-cache-events')

    # A second, dead ``if args is None`` check and a duplicate log of the
    # bare ``args`` used to follow here; only the full command is logged.
    logger.info("Invoking dbt with {}".format(final_args))
    return dbt.handle_and_check(final_args)
def run(self):
    """Compile, then run schema and/or data tests per the CLI flags.

    With both ``--data`` and ``--schema`` set, or with neither, both
    kinds of tests run. With exactly one set, only that kind runs.

    :returns: whatever ``RunManager.run_tests`` returns.
    """
    self.compile()

    runner = RunManager(
        self.project, self.project['target-path'], self.args
    )

    include = self.args.models
    exclude = self.args.exclude

    only_data = bool(self.args.data)
    only_schema = bool(self.args.schema)
    # Asking for both is the same as asking for neither: test everything.
    test_schemas = only_schema or not only_data
    test_data = only_data or not only_schema

    res = runner.run_tests(include, exclude,
                           test_schemas=test_schemas,
                           test_data=test_data)

    logger.info("Done!")
    return res
def write_sources_for_downstream_project(sources_file_path, yml):
    """Write the string ``yml`` to ``sources_file_path``, replacing the file.

    The target path is logged before the file is opened.
    """
    logger.info("Creating sources file: {}".format(sources_file_path))

    with open(sources_file_path, "w") as sources_file:
        sources_file.write(yml)
def open_connection(cls, connection):
    """Return a copy of ``connection`` with an open BigQuery client.

    :param connection: mapping with optional ``'state'`` and
        ``'credentials'`` entries.
    :returns: ``connection`` itself if it is already open; otherwise a
        copy whose ``'handle'`` is the new client and whose ``'state'``
        is ``'open'``.
    :raises dbt.exceptions.FailedToConnectException: if the client cannot
        be created.

    When default credentials are missing, the gcloud default-credential
    setup is triggered and client creation is retried once.
    """
    if connection.get('state') == 'open':
        logger.debug('Connection is already open, skipping open.')
        return connection

    result = connection.copy()
    credentials = connection.get('credentials', {})

    try:
        handle = cls.get_bigquery_client(credentials)

    except google.auth.exceptions.DefaultCredentialsError:
        logger.info("Please log into GCP to continue")
        dbt.clients.gcloud.setup_default_credentials()

        handle = cls.get_bigquery_client(credentials)

    except Exception as e:
        # A leftover ``raise`` used to sit at the top of this handler,
        # which made the logging, the state update and the
        # FailedToConnectException below unreachable.
        logger.debug("Got an error when attempting to create a bigquery "
                     "client: '{}'".format(e))

        result['handle'] = None
        result['state'] = 'fail'

        raise dbt.exceptions.FailedToConnectException(str(e))

    result['handle'] = handle
    result['state'] = 'open'
    return result
def open(cls, connection):
    """Attach a BigQuery client to ``connection`` and mark it open.

    An already-open connection is returned as is. If default credentials
    are missing, the gcloud credential setup runs and the client is
    created again. Any other failure marks the connection ``'fail'`` and
    raises ``FailedToConnectException``.
    """
    already_open = connection.state == 'open'
    if already_open:
        logger.debug('Connection is already open, skipping open.')
        return connection

    creds = connection.credentials
    try:
        client = cls.get_bigquery_client(creds)
    except google.auth.exceptions.DefaultCredentialsError:
        logger.info("Please log into GCP to continue")
        gcloud.setup_default_credentials()
        # Retry now that default credentials should exist.
        client = cls.get_bigquery_client(connection.credentials)
    except Exception as e:
        logger.debug("Got an error when attempting to create a bigquery "
                     "client: '{}'".format(e))
        connection.handle = None
        connection.state = 'fail'
        raise FailedToConnectException(str(e))

    connection.handle = client
    connection.state = 'open'
    return connection
def run(self):
    """Run dbt for the query, based on the graph.

    :returns: the result of ``execute_with_hooks`` for the selected
        nodes, or an empty result from ``get_result`` when no nodes were
        selected.
    :raises InternalException: if ``_flattened_nodes`` is still ``None``
        after runtime initialization.
    """
    self._runtime_initialize()

    nodes = self._flattened_nodes
    if nodes is None:
        raise InternalException(
            'after _runtime_initialize, _flattened_nodes was still None'
        )

    if len(nodes) == 0:
        logger.warning("WARNING: Nothing to do. Try checking your model "
                       "configs and model specification args")
        return self.get_result(
            results=[],
            generated_at=datetime.utcnow(),
            elapsed_time=0.0,
        )

    with TextOnly():
        logger.info("")

    selected_uids = frozenset(
        node.unique_id for node in self._flattened_nodes)
    result = self.execute_with_hooks(selected_uids)

    if flags.WRITE_JSON:
        self.write_manifest()
        self.write_result(result)

    self.task_end_messages(result.results)
    return result
def open(cls, connection):
    """Open a cx_Oracle connection and attach it to ``connection``.

    An already-open connection is returned as is. On a
    ``cx_Oracle.DatabaseError`` the connection is marked ``'fail'`` and
    ``dbt.exceptions.FailedToConnectException`` is raised.
    """
    if connection.state == 'open':
        logger.debug('Connection is already open, skipping open.')
        return connection

    credentials = cls.get_credentials(connection.credentials)
    # "host:port/database" string handed to cx_Oracle as connect target
    host = f'{credentials.host}:{credentials.port}/{credentials.database}'

    try:
        connection.handle = cx_Oracle.connect(
            credentials.user,
            credentials.password,
            host,
            encoding="UTF-8",
        )
        connection.state = 'open'
    except cx_Oracle.DatabaseError as e:
        logger.info("Got an error when attempting to open an oracle "
                    "connection: '{}'".format(e))
        connection.handle = None
        connection.state = 'fail'
        raise dbt.exceptions.FailedToConnectException(str(e))

    return connection
def try_create_schema(self):
    """Create the profile's default schema unless it already exists.

    The existence check and the creation each run in their own
    begin/commit pair. Connection failures during creation are logged
    and re-raised.
    """
    profile = self.project.run_environment()
    adapter = get_adapter(profile)
    schema_name = adapter.get_default_schema(profile)

    check_connection = adapter.begin(profile)
    schema_exists = adapter.check_schema_exists(profile, schema_name)
    adapter.commit(check_connection)

    if schema_exists:
        logger.debug('schema {} already exists -- '
                     'not creating'.format(schema_name))
        return

    try:
        create_connection = adapter.begin(profile)
        adapter.create_schema(profile, schema_name)
        adapter.commit(create_connection)
    except (dbt.exceptions.FailedToConnectException,
            psycopg2.OperationalError) as e:
        logger.info("ERROR: Could not connect to the target database. Try "
                    "`dbt debug` for more information.")
        logger.info(str(e))
        raise
def run_dbt_and_check(self, args=None):
    """Invoke dbt in ``--strict`` mode and return ``handle_and_check``'s result.

    :param args: dbt arguments; defaults to ``["run"]``.
    """
    requested = ["run"] if args is None else args
    strict_args = ["--strict"] + requested

    logger.info("Invoking dbt with {}".format(strict_args))
    return dbt.handle_and_check(strict_args)
def write_sql(self, raw_schema):
    """Render the SQL for this relation and write it out.

    Unmanaged tables are only logged. For managed ones, a SAFE and a PII
    view are rendered. The SAFE view goes into ``<app_path>/<app>`` and
    the PII view into ``<app_path>/<app>_PII``. Missing directories are
    created.
    """
    relation_dict = self.prep_meta_data()

    if self.is_unmanaged:
        logger.info(
            "{}.{} is an unmanaged table, skipping SQL generation.".format(
                self.app, self.relation))
        return

    for view_type in ("SAFE", "PII"):
        if view_type == "SAFE":
            folder_name = self.app
        else:
            folder_name = "{}_{}".format(self.app, view_type)
        sql_path = os.path.join(self.app_path, folder_name)

        if not os.path.isdir(sql_path):
            os.mkdir(sql_path)

        sql_file_name = "{}.sql".format(self.get_model_name(view_type))
        sql_file_path = os.path.join(sql_path, sql_file_name)

        sql = self.render_sql(self.app, view_type, relation_dict,
                              raw_schema, self.redactions)
        self.write_sql_file(sql_file_path, sql)
def run(self):
    """Compile the selected models through a ``RunManager`` and log 'Done.'."""
    target_path = self.project['target-path']
    runner = RunManager(self.project, target_path, self.args)

    runner.compile_models(self.args.models, self.args.exclude)

    logger.info('Done.')
def safe_run_hooks(self, adapter, hook_type: RunHookType,
                   extra_context: Dict[str, Any]) -> None:
    """Run hooks of ``hook_type``; log and re-raise dbt runtime errors."""
    try:
        self.run_hooks(adapter, hook_type, extra_context)
    except dbt.exceptions.RuntimeException:
        message = "Database error while running {}".format(hook_type)
        logger.info(message)
        raise
def __init__(self, project, target_path, graph_type, args):
    """Store run settings, resolve the target and build the run context.

    :param project: project object; its ``run_environment()`` selects the
        target.
    :param target_path: stored on the instance as ``target_path``.
    :param graph_type: stored on the instance as ``graph_type``.
    :param args: parsed arguments; ``args.threads`` is passed to the
        target.

    If the target requires an ssh tunnel it is opened here, before the
    schema helper and the context dictionary are created.
    """
    self.project = project
    self.target_path = target_path
    self.graph_type = graph_type
    self.args = args

    self.target = dbt.targets.get_target(self.project.run_environment(),
                                         self.args.threads)

    if self.target.should_open_tunnel():
        # ``end=""`` was a print() leftover; the logging API has no such
        # keyword and a standard logger raises TypeError on it.
        # NOTE(review): assumes ``logger`` is a stdlib-style logger.
        logger.info("Opening ssh tunnel to host {}... ".format(
            self.target.ssh_host))
        sys.stdout.flush()
        self.target.open_tunnel_if_needed()
        logger.info("Connected")

    self.schema = dbt.schema.Schema(self.project, self.target)

    self.context = {
        "run_started_at": datetime.now(),
        "invocation_id": dbt.tracking.invocation_id,
        "get_columns_in_table": self.schema.get_columns_in_table,
        "get_missing_columns": self.schema.get_missing_columns,
        "already_exists": self.schema.table_exists,
    }
def run_from_args(parsed):
    """Build the task selected by ``parsed`` and run it.

    :param parsed: parsed command-line arguments; ``parsed.cls`` is the
        task class.
    :returns: ``(task, results)`` where ``results`` is what ``task.run()``
        returned.
    """
    log_cache_events(getattr(parsed, 'log_cache_events', False))
    flags.set_from_args(parsed)

    parsed.cls.pre_init_hook(parsed)
    # we can now use the logger for stdout

    logger.info("Running with dbt{}".format(dbt.version.installed))

    # this will convert DbtConfigErrors into RuntimeExceptions
    task = parsed.cls.from_args(args=parsed)
    logger.debug("running dbt with arguments {parsed}", parsed=str(parsed))

    if task.config is not None:
        log_path = getattr(task.config, 'log_path', None)
    else:
        log_path = None

    # we can finally set the file logger up
    log_manager.set_path(log_path)

    active_user = dbt.tracking.active_user
    if active_user is not None:  # mypy appeasement, always true
        logger.debug("Tracking: {}".format(active_user.state()))

    results = None
    with track_run(task):
        results = task.run()

    return task, results
def run_dbt_and_check(self, args=None):
    """Run dbt with ``--strict`` prepended to ``args`` (default ``["run"]``).

    :returns: the value returned by ``dbt.handle_and_check``.
    """
    if args is None:
        args = ["run"]

    command = ["--strict"]
    command = command + args

    logger.info("Invoking dbt with {}".format(command))
    return dbt.handle_and_check(command)
def run(self, schema, banned_column_names):  # pylint: disable=arguments-differ
    """
    Fetch the catalog for ``schema``, falling back to per-letter fetches.

    The full catalog is tried first. If that fails with an
    "Information schema query returned too much data" error, the catalog
    is fetched by first letter instead. Any other error is re-raised.

    Raises an Exception if ``schema`` contains anything other than
    letters, digits and underscores.
    """
    # Check for any non-word characters that might indicate a SQL injection attack
    suspicious = re.search("[^a-zA-Z0-9_]", schema)
    if suspicious:
        raise Exception(
            "Non-word character in schema name '{}'! Possible SQL injection?"
            .format(schema))

    adapter = get_adapter(self.config)

    try:
        return self.fetch_full_catalog(adapter, schema, banned_column_names)
    except Exception as e:  # pylint: disable=broad-except
        # TODO: Catch a less-broad exception than Exception.
        too_large = (
            "Information schema query returned too much data" in str(e))
        if not too_large:
            raise
        logger.info(
            "Schema too large to fetch at once, fetching by first letter instead."
        )
        return self.fetch_catalog_by_letter(
            adapter, schema, banned_column_names)
def path_info(self):
    """Log where the profiles directory is and the command to open it."""
    format_args = {
        'open_cmd': dbt.clients.system.open_dir_cmd(),
        'profiles_dir': self.profiles_dir,
    }
    logger.info(PROFILE_DIR_MESSAGE.format(**format_args))
def print_end_of_run_summary(num_errors, early_exit=False):
    """Log a blank line followed by a colored end-of-run status line.

    :param num_errors: number of errors; any positive value yields the
        red "Completed with N errors:" message.
    :param early_exit: when true, the keyboard-interrupt message is used
        regardless of ``num_errors``.
    """
    if early_exit:
        message = yellow('Exited because of keyboard interrupt.')
    elif num_errors > 0:
        message = red('Completed with {} errors:'.format(num_errors))
    else:
        message = green('Completed successfully')

    for line in ('', '{}'.format(message)):
        logger.info(line)
def print_run_status_line(results):
    """Log the "Done. PASS=... ERROR=... SKIP=... TOTAL=..." summary line.

    Each result is classified by ``interpret_run_result``; the returned
    key is counted and the total incremented.
    """
    stats = dict.fromkeys(('error', 'skip', 'pass', 'total'), 0)

    for run_result in results:
        outcome = interpret_run_result(run_result)
        stats[outcome] += 1
        stats['total'] += 1

    template = "\nDone. PASS={pass} ERROR={error} SKIP={skip} TOTAL={total}"
    logger.info(template.format(**stats))
def run_dbt(self, args=None, expect_pass=True, strict=True):
    """Invoke dbt and assert that it succeeded or failed as expected.

    :param args: dbt arguments; defaults to ``["run"]``. The caller's
        list is never modified.
    :param expect_pass: expected success flag from
        ``dbt.handle_and_check``.
    :param strict: prepend ``--strict`` to the arguments.
    :returns: the results returned by ``dbt.handle_and_check``.
    """
    if args is None:
        args = ["run"]

    # Always build a fresh list: appending to ``args`` directly would
    # leak '--log-cache-events' into the caller's list when strict=False.
    final_args = (["--strict"] if strict else []) + list(args)
    final_args.append('--log-cache-events')

    logger.info("Invoking dbt with {}".format(final_args))

    res, success = dbt.handle_and_check(final_args)
    self.assertEqual(
        success, expect_pass,
        "dbt exit state did not match expected")

    return res
def print_compile_stats(stats):
    """Log a one-line "Found ..." summary of node counts per type.

    :param stats: mapping of node type to count. Known types missing
        from it are reported with a count of 0.
    """
    names = {
        NodeType.Model: 'models',
        NodeType.Test: 'tests',
        NodeType.Archive: 'archives',
        NodeType.Analysis: 'analyses',
        NodeType.Macro: 'macros',
        NodeType.Operation: 'operations',
        NodeType.Seed: 'seed files',
        NodeType.Source: 'sources',
    }

    results = dict.fromkeys(names, 0)
    results.update(stats)

    pieces = []
    for node_type, count in results.items():
        pieces.append("{} {}".format(count, names.get(node_type)))
    stat_line = ", ".join(pieces)

    logger.info("Found {}".format(stat_line))
def show_table(self, result):
    """Print up to 10 randomly ordered rows of the result's table.

    A header naming ``schema.alias`` and an underline are logged first.
    The table itself is printed by its own ``print_table``.
    """
    node = result.node
    table = node.agate_table
    # A random sort key per row shuffles the table.
    rand_table = table.order_by(lambda x: random.random())

    header = "Random sample of table: {}.{}".format(node.schema, node.alias)
    underline = "-" * len(header)

    for line in ("", header, underline):
        logger.info(line)

    rand_table.print_table(max_rows=10, max_columns=None)
    logger.info("")
def run(self):
    """Set up the profiles directory and clone the starter project.

    :raises RuntimeError: if a path named after the project already
        exists.
    """
    project_dir = self.args.project_name

    profiles_dir = dbt.config.PROFILES_DIR
    profiles_file = os.path.join(profiles_dir, 'profiles.yml')

    self.create_profiles_dir(profiles_dir)
    self.create_profiles_file(profiles_file)

    logger.info(
        "Creating dbt configuration folder at {}".format(profiles_dir))

    already_there = os.path.exists(project_dir)
    if already_there:
        raise RuntimeError("directory {} already exists!".format(
            project_dir
        ))

    self.clone_starter_repo(project_dir)

    logger.info(self.get_addendum(project_dir, profiles_dir))
def run(self):
    """Serve the docs from the target directory over HTTP until interrupted.

    The process changes into the target path, copies the docs index file
    there as ``index.html``, tries to open a browser tab, and then blocks
    in ``serve_forever``. The server is shut down and closed on the way
    out.
    """
    os.chdir(self.config.target_path)

    port = self.args.port

    shutil.copyfile(DOCS_INDEX_FILE_PATH, 'index.html')

    startup_messages = (
        "Serving docs at 0.0.0.0:{}".format(port),
        "To access from your browser, navigate to http://localhost:{}."
        .format(port),
        "Press Ctrl+C to exit.\n\n",
    )
    for message in startup_messages:
        logger.info(message)

    httpd = TCPServer(
        ('0.0.0.0', port),
        SimpleHTTPRequestHandler
    )

    # Opening the browser is best-effort; failures are ignored.
    try:
        webbrowser.open_new_tab('http://127.0.0.1:{}'.format(port))
    except webbrowser.Error:
        pass

    try:
        httpd.serve_forever()  # blocks
    finally:
        httpd.shutdown()
        httpd.server_close()

    return None
def print_fancy_output_line(msg, status, index, total, execution_time=None):
    """Log a dot-padded progress line ending in a bracketed status.

    :param msg: message shown after the timestamp and progress counter.
    :param status: text placed inside the trailing brackets.
    :param index: position of this item; the "N of M" counter is omitted
        when ``index`` or ``total`` is ``None``.
    :param total: total number of items.
    :param execution_time: seconds, shown with two decimals when given.
    """
    show_progress = index is not None and total is not None
    progress = '{} of {} '.format(index, total) if show_progress else ''

    prefix = "{timestamp} | {progress}{message}".format(
        timestamp=get_timestamp(),
        progress=progress,
        message=msg)

    # Pad with dots to at least 80 columns.
    justified = prefix.ljust(80, ".")

    if execution_time is not None:
        status_time = " in {execution_time:0.2f}s".format(
            execution_time=execution_time)
    else:
        status_time = ""

    logger.info("{justified} [{status}{status_time}]".format(
        justified=justified, status=status, status_time=status_time))
def run(self):
    """Run dbt for the query, based on the graph.

    :returns: the list of node results, or ``[]`` when no nodes were
        selected.

    Hooks run before and after node execution. Adapter connections are
    cleaned up even if execution fails. The run result is written to
    ``result_path()``.
    """
    self._runtime_initialize()
    adapter = get_adapter(self.config)

    nothing_selected = len(self._flattened_nodes) == 0
    if nothing_selected:
        logger.info("WARNING: Nothing to do. Try checking your model "
                    "configs and model specification args")
        return []

    logger.info("")

    selected_uids = frozenset(
        node.unique_id for node in self._flattened_nodes)

    try:
        self.before_hooks(adapter)
        started = time.time()
        self.before_run(adapter, selected_uids)
        res = self.execute_nodes()
        self.after_run(adapter, res)
        elapsed = time.time() - started
        self.after_hooks(adapter, res, elapsed)
    finally:
        adapter.cleanup_connections()

    result = self.get_result(
        results=res,
        elapsed_time=elapsed,
        generated_at=dbt.utils.timestring()
    )
    result.write(self.result_path())

    self.task_end_messages(res)
    return res
def get_nodes_from_spec(self, graph, spec):
    """Return the set of nodes selected by ``spec`` in ``graph``.

    The selector type picks the lookup method (fqn, tag or source).
    Nodes gathered by ``collect_models`` and ``collect_tests`` are then
    added. An unknown selector type is logged and yields an empty set.
    """
    filter_map = {
        SELECTOR_FILTERS.FQN: self.get_nodes_by_qualified_name,
        SELECTOR_FILTERS.TAG: self.get_nodes_by_tag,
        SELECTOR_FILTERS.SOURCE: self.get_nodes_by_source,
    }

    filter_method = filter_map.get(spec.selector_type)

    if filter_method is not None:
        collected = set(filter_method(graph, spec.selector_value))
        collected.update(self.collect_models(graph, collected, spec))
        collected.update(self.collect_tests(graph, collected))
        return collected

    valid_selectors = ", ".join(filter_map.keys())
    logger.info("The '{}' selector specified in {} is invalid. Must "
                "be one of [{}]".format(
                    spec.selector_type,
                    spec.raw,
                    valid_selectors))
    return set()
def print_run_result_error(result, newline=True):
    """Log the details of a failed or errored run result.

    :param result: run result. For a failed result the node's type, name
        and file path are logged, plus the compiled SQL path when the
        node has one. Otherwise ``result.error`` is logged line by line
        with the first line in yellow.
    :param newline: log an empty line first.
    """
    if newline:
        logger.info("")

    if not result.failed:
        for position, line in enumerate(result.error.split("\n")):
            logger.info(yellow(line) if position == 0 else line)
        return

    node = result.node
    logger.info(yellow("Failure in {} {} ({})").format(
        node.get('resource_type'),
        node.get('name'),
        node.get('original_file_path')))
    logger.info(" Got {} results, expected 0.".format(result.status))

    if result.node.get('build_path') is not None:
        logger.info("")
        logger.info(" compiled SQL at {}".format(
            result.node.get('build_path')))
def print_timestamped_line(msg, use_color=None):
    """Log ``msg`` prefixed with the current timestamp.

    :param use_color: when not ``None``, ``msg`` is first passed through
        ``color`` with this value.
    """
    text = msg if use_color is None else color(msg, use_color)
    logger.info("{} | {}".format(get_timestamp(), text))
def safe_run_hooks(self, adapter, hook_type, extra_context):
    """Run hooks of ``hook_type``; log and re-raise dbt runtime errors."""
    try:
        self.run_hooks(adapter, hook_type, extra_context)
    except dbt.exceptions.RuntimeException:
        note = "Database error while running {}".format(hook_type)
        logger.info(note)
        raise
def compiler_warning(model, msg, resource_type='model'):
    """Log a compilation warning for ``model``.

    :param model: object passed to ``get_model_name_or_none`` to obtain
        the name shown in the message.
    :param msg: warning text.
    :param resource_type: label used in the message; defaults to 'model'.
    """
    template = "* Compilation warning while compiling {} {}:\n* {}\n"
    model_name = get_model_name_or_none(model)
    logger.info(template.format(resource_type, model_name, msg))