def parse_project(self, project: Project, parser_files) -> None:
    """Parse all loaded (non-macro, non-hook) files for one project.

    :param project: the project whose files should be parsed
    :param parser_files: mapping of parser class name -> list of file
        search keys into ``self.manifest.files``
    Appends per-parser and per-project timing information to
    ``self._perf_info``.
    """
    project_parser_info: List[ParserInfo] = []
    start_timer = time.perf_counter()
    total_path_count = 0

    # Loop through parsers with loaded files. Note: SchemaParser must be last
    parser_types: List[Type[Parser]] = [
        ModelParser, SnapshotParser, AnalysisParser, DataTestParser,
        SeedParser, DocumentationParser, SchemaParser,
    ]
    for parser_cls in parser_types:
        parser_name = parser_cls.__name__
        # No point in creating a parser if we don't have files for it.
        # A single .get() covers both the missing-key and empty-list cases.
        if not parser_files.get(parser_name):
            continue

        # Initialize timing info
        parser_path_count = 0
        parser_start_timer = time.perf_counter()

        # Parse the project files for this parser
        parser: Parser = parser_cls(project, self.manifest, self.root_project)
        for search_key in parser_files[parser_name]:
            block = FileBlock(self.manifest.files[search_key])
            self.parse_with_cache(block, parser)
            parser_path_count += 1

        # Save timing info
        project_parser_info.append(ParserInfo(
            parser=parser.resource_type,
            path_count=parser_path_count,
            elapsed=time.perf_counter() - parser_start_timer,
        ))
        total_path_count += parser_path_count

    # HookParser doesn't run from loaded files, just dbt_project.yml,
    # so do separately
    hook_parser = HookParser(project, self.manifest, self.root_project)
    path = hook_parser.get_path()
    file_block = FileBlock(
        load_source_file(path, ParseFileType.Hook, project.project_name))
    self.parse_with_cache(file_block, hook_parser)

    # Store the performance info
    elapsed = time.perf_counter() - start_timer
    project_info = ProjectLoaderInfo(
        project_name=project.project_name,
        path_count=total_path_count,
        elapsed=elapsed,
        parsers=project_parser_info,
    )
    self._perf_info.projects.append(project_info)
    self._perf_info.path_count += total_path_count
def test_model_no_cache(self):
    """With no old manifest present, parse_file must run for the file."""
    sql_file = self._matching_file('models', 'model_1.sql')
    self.parser.load_file.return_value = sql_file
    self.loader.old_manifest = None

    self.loader.parse_with_cache(FileBlock(sql_file), self.parser)

    # Nothing was cached, so the parser must have been handed a
    # FileBlock containing the given source file.
    self.parser.parse_file.assert_called_once_with(
        FileBlock(file=sql_file))
def _get_file(self, path: FilePath, parser: BaseParser) -> FileBlock:
    """Return the cached FileBlock for ``path``, loading it on a cache miss.

    :param path: the file path whose ``search_key`` indexes the cache
    :param parser: the parser used to load the file when not cached
    :return: the (possibly newly cached) FileBlock
    """
    try:
        # EAFP: one dict lookup on the common (cache-hit) path instead of
        # the original contains-check followed by a second lookup.
        return self._loaded_file_cache[path.search_key]
    except KeyError:
        block = FileBlock(file=parser.load_file(path))
        self._loaded_file_cache[path.search_key] = block
        return block
def test_model_cache_mismatch_checksum(self):
    """A checksum mismatch against the old manifest forces a re-parse."""
    sql_file = self._mismatched_file('models', 'model_1.sql')
    self.parser.load_file.return_value = sql_file

    cached_copy = self._mismatched_file('models', 'model_1.sql')
    cached_copy.nodes.append('model.root.model_1')
    old_manifest = self._new_manifest()
    old_manifest.files[cached_copy.path.search_key] = cached_copy
    old_manifest.nodes = {'model.root.model_1': mock.MagicMock()}
    self.loader.old_manifest = old_manifest

    self.loader.parse_with_cache(FileBlock(sql_file), self.parser)

    # The cached checksum did not match, so parse_file must have been
    # called with a FileBlock holding the freshly loaded source file.
    self.parser.parse_file.assert_called_once_with(
        FileBlock(file=sql_file))
def _build_file(self, contents, relative_path) -> FileBlock:
    """Wrap ``contents`` in an in-memory SourceFile and return its FileBlock."""
    file_path = FilePath(
        relative_path=relative_path,
        project_root=self.root_path,
        searched_path=self.subdir_path,
    )
    sf = SourceFile(path=file_path, checksum=FileHash.empty())
    sf.contents = contents
    return FileBlock(file=sf)
def test_model_cache_missing_file(self):
    """A file absent from the old manifest is parsed from scratch."""
    sql_file = self._matching_file('models', 'model_1.sql')
    self.parser.load_file.return_value = sql_file

    other_file = self._matching_file('models', 'model_2.sql')
    other_file.nodes.append('model.root.model_2')
    old_manifest = self._new_manifest()
    old_manifest.files[other_file.path.search_key] = other_file
    old_manifest.nodes = {'model.root.model_2': mock.MagicMock()}
    self.loader.old_manifest = old_manifest

    self.loader.parse_with_cache(FileBlock(sql_file), self.parser)

    # model_1.sql was never in the cache, so parse_file must have been
    # called with a FileBlock holding the given source file.
    self.parser.parse_file.assert_called_once_with(
        FileBlock(file=sql_file))
def create_macro_manifest(self):
    """Parse every project's macro files and return a MacroManifest."""
    for project in self.all_projects.values():
        # what is the manifest passed in actually used for?
        macro_parser = MacroParser(project, self.manifest)
        for macro_path in macro_parser.get_paths():
            macro_file = load_source_file(
                macro_path, ParseFileType.Macro, project.project_name)
            # This does not add the file to the manifest.files,
            # but that shouldn't be necessary here.
            self.parse_with_cache(FileBlock(macro_file), macro_parser)
    return MacroManifest(self.manifest.macros)
def file_block_for(self, data: str, filename: str, searched: str):
    """Build a FileBlock holding ``data`` as if it lived in the snowplow package."""
    project_root = get_abs_os_path('./dbt_modules/snowplow')
    file_path = FilePath(
        searched_path=searched,
        relative_path=normalize(filename),
        project_root=project_root,
    )
    sf = SourceFile(
        path=file_path,
        checksum=FileHash.from_contents(data),
    )
    sf.contents = data
    return FileBlock(file=sf)
def test_model_cache_hit(self):
    """A matching cached entry means parse_file is skipped entirely."""
    sql_file = self._matching_file('models', 'model_1.sql')
    self.parser.load_file.return_value = sql_file

    cached_copy = self._matching_file('models', 'model_1.sql')
    cached_copy.nodes.append('model.root.model_1')
    old_manifest = self._new_manifest()
    old_manifest.files[cached_copy.path.search_key] = cached_copy
    self.loader.old_manifest = old_manifest
    self.loader.old_manifest.nodes = {
        'model.root.model_1': mock.MagicMock()
    }

    self.loader.parse_with_cache(FileBlock(sql_file), self.parser)

    # Cache hit: the parser's parse_file must never have run.
    self.parser.parse_file.assert_not_called()
def load(self):
    """Read, then parse, every file in every configured project.

    Reads all project files first, parses macros next (so they are
    resolvable while the remaining files are parsed), then parses each
    project's other files via ``parse_project``. Timing for each phase
    is recorded on ``self._perf_info``.
    """
    if self.old_manifest is not None:
        logger.debug('Got an acceptable saved parse result')

    # Read files creates a dictionary of projects to a dictionary
    # of parsers to lists of file strings. The file strings are
    # used to get the SourceFiles from the manifest files.
    # In the future the loaded files will be used to control
    # partial parsing, but right now we're just moving the
    # file loading out of the individual parsers and doing it
    # all at once.
    start_read_files = time.perf_counter()
    project_parser_files = {}
    for project in self.all_projects.values():
        read_files(project, self.manifest.files, project_parser_files)
    self._perf_info.read_files_elapsed = (
        time.perf_counter() - start_read_files)

    # We need to parse the macros first, so they're resolvable when
    # the other files are loaded
    start_load_macros = time.perf_counter()
    for project in self.all_projects.values():
        parser = MacroParser(project, self.manifest)
        parser_files = project_parser_files[project.project_name]
        # Use .get() so a project with no macro files doesn't raise a
        # KeyError — consistent with the missing-parser guard in
        # parse_project.
        for search_key in parser_files.get('MacroParser', []):
            block = FileBlock(self.manifest.files[search_key])
            self.parse_with_cache(block, parser)
    self.reparse_macros()
    # This is where a loop over self.manifest.macros should be performed
    # to set the 'depends_on' information from static rendering.
    self._perf_info.load_macros_elapsed = (
        time.perf_counter() - start_load_macros)

    # Now that the macros are parsed, parse the rest of the files.
    # This is currently done on a per project basis,
    # but in the future we may change that
    start_parse_projects = time.perf_counter()
    for project in self.all_projects.values():
        self.parse_project(
            project, project_parser_files[project.project_name])
    self._perf_info.parse_project_elapsed = (
        time.perf_counter() - start_parse_projects)
def parse_file_from_path(self, path: FilePath):
    """Load the source file at ``path`` and immediately parse it."""
    self.parse_file(FileBlock(file=self.load_file(path)))