def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
    """Create the MetadataChangeEvent for the dataset backing a Looker view.

    Attaches, in order: browse paths, a not-removed status, upstream lineage
    (when derivable), schema metadata (when derivable), and custom properties.
    """
    logger.debug(f"looker_view = {looker_view.id}")

    # Unconditional aspects first; optional ones are appended only when
    # the corresponding extraction produced a value.
    aspects = [
        BrowsePaths(paths=[looker_view.id.get_browse_path(self.source_config)]),
        Status(removed=False),
    ]

    lineage = self._get_upstream_lineage(looker_view)
    if lineage is not None:
        aspects.append(lineage)

    schema = LookerUtil._get_schema(
        self.source_config.platform_name,
        looker_view.id.view_name,
        looker_view.fields,
        self.reporter,
    )
    if schema is not None:
        aspects.append(schema)

    aspects.append(self._get_custom_properties(looker_view))

    snapshot = DatasetSnapshot(
        urn=looker_view.id.get_urn(self.source_config),
        aspects=aspects,
    )
    return MetadataChangeEvent(proposedSnapshot=snapshot)
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Emit metadata workunits for all Looker dashboards (optionally including
    deleted ones), followed by explore metadata events and tag MCEs.

    Yields:
        MetadataWorkUnit: one per dashboard/chart MCE, explore event, and tag.
    """
    dashboards = self.client.all_dashboards(fields="id")
    deleted_dashboards = (
        self.client.search_dashboards(fields="id", deleted="true")
        if self.source_config.include_deleted
        else []
    )
    # idiom: truthiness instead of comparing against []
    if deleted_dashboards:
        logger.debug(f"Deleted Dashboards = {deleted_dashboards}")

    dashboard_ids = [dashboard_base.id for dashboard_base in dashboards]
    dashboard_ids.extend(
        [deleted_dashboard.id for deleted_dashboard in deleted_dashboards]
    )

    for dashboard_id in dashboard_ids:
        assert dashboard_id is not None
        self.reporter.report_dashboards_scanned()
        if not self.source_config.dashboard_pattern.allowed(dashboard_id):
            self.reporter.report_dashboards_dropped(dashboard_id)
            continue
        try:
            fields = [
                "id",
                "title",
                "dashboard_elements",
                "dashboard_filters",
                "deleted",
                "description",
                "folder",
                "user_id",
            ]
            dashboard_object = self.client.dashboard(
                dashboard_id=dashboard_id, fields=",".join(fields)
            )
        except SDKError:
            # A looker dashboard could be deleted in between the list and the get
            self.reporter.report_warning(
                dashboard_id,
                f"Error occurred while loading dashboard {dashboard_id}. Skipping.",
            )
            continue

        if self.source_config.skip_personal_folders:
            if dashboard_object.folder is not None and (
                dashboard_object.folder.is_personal
                or dashboard_object.folder.is_personal_descendant
            ):
                self.reporter.report_warning(
                    dashboard_id, "Dropped due to being a personal folder"
                )
                self.reporter.report_dashboards_dropped(dashboard_id)
                continue

        looker_dashboard = self._get_looker_dashboard(dashboard_object, self.client)
        mces = self._make_dashboard_and_chart_mces(looker_dashboard)
        for mce in mces:
            workunit = MetadataWorkUnit(
                id=f"looker-{mce.proposedSnapshot.urn}", mce=mce
            )
            self.reporter.report_workunit(workunit)
            yield workunit

    if (
        self.source_config.extract_owners
        and self.resolved_user_ids > 0
        and self.email_ids_missing == self.resolved_user_ids
    ):
        # Looks like we tried to extract owners and could not find their email
        # addresses. This is likely a permissions issue.
        self.reporter.report_warning(
            "api",
            "Failed to extract owners emails for any dashboards. Please enable the see_users permission for your Looker API key",
        )

    explore_events = self._make_explore_metadata_events()
    for event in explore_events:
        if isinstance(event, MetadataChangeEvent):
            workunit = MetadataWorkUnit(
                id=f"looker-{event.proposedSnapshot.urn}", mce=event
            )
        elif isinstance(event, MetadataChangeProposalWrapper):
            # We want to treat subtype aspects as optional, so allowing failures
            # in this aspect to be treated as warnings rather than failures
            workunit = MetadataWorkUnit(
                id=f"looker-{event.entityUrn}-{event.aspectName}",
                mcp=event,
                # simplified from the redundant `True if ... else False` ternary
                treat_errors_as_warnings=event.aspectName in ["subTypes"],
            )
        else:
            raise Exception("Unexpected type of event {}".format(event))
        self.reporter.report_workunit(workunit)
        yield workunit

    if self.source_config.tag_measures_and_dimensions and explore_events:
        # Emit tag MCEs for measures and dimensions if we produced any explores:
        for tag_mce in LookerUtil.get_tag_mces():
            workunit = MetadataWorkUnit(
                id=f"tag-{tag_mce.proposedSnapshot.urn}",
                mce=tag_mce,
            )
            self.reporter.report_workunit(workunit)
            yield workunit
def get_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
    # Walk every *.model.lkml file under base_folder, resolve each model's
    # includes, and emit one dataset workunit (plus optional MCP aspect
    # workunits) per allowed LookML view; finally emit tag MCEs if configured.
    viewfile_loader = LookerViewFileLoader(
        str(self.source_config.base_folder), self.reporter)

    # some views can be mentioned by multiple 'include' statements, so this
    # set is used to prevent creating duplicate MCE messages
    processed_view_files: Set[str] = set()

    # The ** means "this directory and all subdirectories", and hence should
    # include all the files we want.
    model_files = sorted(
        self.source_config.base_folder.glob("**/*.model.lkml"))
    model_suffix_len = len(".model")

    for file_path in model_files:
        self.reporter.report_models_scanned()
        # Strip the trailing ".model" from the stem to recover the model name.
        model_name = file_path.stem[0:-model_suffix_len]

        if not self.source_config.model_pattern.allowed(model_name):
            self.reporter.report_models_dropped(model_name)
            continue
        try:
            logger.debug(f"Attempting to load model: {file_path}")
            model = self._load_model(str(file_path))
        except Exception as e:
            # Best-effort: a broken model file drops only that model.
            self.reporter.report_warning(
                model_name,
                f"unable to load Looker model at {file_path}: {repr(e)}")
            continue

        assert model.connection is not None
        connectionDefinition = self._get_connection_def_based_on_connection_string(
            model.connection)

        if connectionDefinition is None:
            self.reporter.report_warning(
                f"model-{model_name}",
                f"Failed to load connection {model.connection}. Check your API key permissions.",
            )
            self.reporter.report_models_dropped(model_name)
            continue

        project_name = self.get_project_name(model_name)

        for include in model.resolved_includes:
            logger.debug(f"Considering {include} for model {model_name}")
            # Skip view files already handled via another model's includes.
            if include in processed_view_files:
                logger.debug(
                    f"view '{include}' already processed, skipping it")
                continue
            logger.debug(f"Attempting to load view file: {include}")
            looker_viewfile = viewfile_loader.load_viewfile(
                include, connectionDefinition, self.reporter)
            if looker_viewfile is not None:
                for raw_view in looker_viewfile.views:
                    self.reporter.report_views_scanned()
                    try:
                        maybe_looker_view = LookerView.from_looker_dict(
                            project_name,
                            model_name,
                            raw_view,
                            connectionDefinition,
                            looker_viewfile,
                            viewfile_loader,
                            self.reporter,
                            self.source_config.parse_table_names_from_sql,
                            self.source_config.sql_parser,
                        )
                    except Exception as e:
                        # A single unparseable view should not abort the scan.
                        self.reporter.report_warning(
                            include,
                            f"unable to load Looker view {raw_view}: {repr(e)}",
                        )
                        continue
                    if maybe_looker_view:
                        if self.source_config.view_pattern.allowed(
                                maybe_looker_view.id.view_name):
                            mce = self._build_dataset_mce(
                                maybe_looker_view)
                            workunit = MetadataWorkUnit(
                                id=f"lookml-view-{maybe_looker_view.id}",
                                mce=mce,
                            )
                            self.reporter.report_workunit(workunit)
                            # NOTE(review): the file is marked processed only
                            # when at least one view in it passed the pattern
                            # filter — presumably intentional; confirm.
                            processed_view_files.add(include)
                            yield workunit
                            for mcp in self._build_dataset_mcps(
                                    maybe_looker_view):
                                # We want to treat mcp aspects as optional, so
                                # allowing failures in this aspect to be treated
                                # as warnings rather than failures
                                workunit = MetadataWorkUnit(
                                    id=
                                    f"lookml-view-{mcp.aspectName}-{maybe_looker_view.id}",
                                    mcp=mcp,
                                    treat_errors_as_warnings=True,
                                )
                                self.reporter.report_workunit(workunit)
                                yield workunit
                        else:
                            self.reporter.report_views_dropped(
                                str(maybe_looker_view.id))

    if (self.source_config.tag_measures_and_dimensions
            and self.reporter.workunits_produced != 0):
        # Emit tag MCEs for measures and dimensions:
        for tag_mce in LookerUtil.get_tag_mces():
            workunit = MetadataWorkUnit(
                id=f"tag-{tag_mce.proposedSnapshot.urn}",
                mce=tag_mce)
            self.reporter.report_workunit(workunit)
            yield workunit
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    """Emit metadata workunits for all Looker dashboards, processing each
    dashboard concurrently on a thread pool, then emit explore metadata
    events and tag MCEs.

    Yields:
        MetadataWorkUnit: one per dashboard workunit, explore event, and tag.
    """
    dashboards = self.client.all_dashboards(fields="id")
    deleted_dashboards = (
        self.client.search_dashboards(fields="id", deleted="true")
        if self.source_config.include_deleted
        else []
    )
    # idiom: truthiness instead of comparing against []
    if deleted_dashboards:
        logger.debug(f"Deleted Dashboards = {deleted_dashboards}")

    dashboard_ids = [dashboard_base.id for dashboard_base in dashboards]
    dashboard_ids.extend(
        [deleted_dashboard.id for deleted_dashboard in deleted_dashboards]
    )

    with concurrent.futures.ThreadPoolExecutor(
        max_workers=self.source_config.max_threads
    ) as async_executor:
        async_workunits = [
            async_executor.submit(self.process_dashboard, dashboard_id)
            for dashboard_id in dashboard_ids
        ]
        for async_workunit in concurrent.futures.as_completed(async_workunits):
            work_units, dashboard_id, start_time, end_time = (
                async_workunit.result()
            )
            logger.info(
                f"Running time of process_dashboard for {dashboard_id} = {(end_time-start_time).total_seconds()}"
            )
            self.reporter.report_upstream_latency(start_time, end_time)
            for mwu in work_units:
                # Report before yielding, consistent with every other emit
                # path in this source (previously yielded first, so a consumer
                # that stopped early would leave the workunit unreported).
                self.reporter.report_workunit(mwu)
                yield mwu

    if (
        self.source_config.extract_owners
        and self.resolved_user_ids > 0
        and self.email_ids_missing == self.resolved_user_ids
    ):
        # Looks like we tried to extract owners and could not find their email
        # addresses. This is likely a permissions issue
        self.reporter.report_warning(
            "api",
            "Failed to extract owners emails for any dashboards. Please enable the see_users permission for your Looker API key",
        )

    explore_events = self._make_explore_metadata_events()
    for event in explore_events:
        if isinstance(event, MetadataChangeEvent):
            workunit = MetadataWorkUnit(
                id=f"looker-{event.proposedSnapshot.urn}", mce=event
            )
        elif isinstance(event, MetadataChangeProposalWrapper):
            # We want to treat subtype aspects as optional, so allowing failures
            # in this aspect to be treated as warnings rather than failures
            workunit = MetadataWorkUnit(
                id=f"looker-{event.entityUrn}-{event.aspectName}",
                mcp=event,
                # simplified from the redundant `True if ... else False` ternary
                treat_errors_as_warnings=event.aspectName in ["subTypes"],
            )
        else:
            raise Exception("Unexpected type of event {}".format(event))
        self.reporter.report_workunit(workunit)
        yield workunit

    if self.source_config.tag_measures_and_dimensions and explore_events:
        # Emit tag MCEs for measures and dimensions if we produced any explores:
        for tag_mce in LookerUtil.get_tag_mces():
            workunit = MetadataWorkUnit(
                id=f"tag-{tag_mce.proposedSnapshot.urn}",
                mce=tag_mce,
            )
            self.reporter.report_workunit(workunit)
            yield workunit