def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
    """
    Creates MetadataChangeEvent for the dataset, creating upstream lineage links
    """
    logger.debug(f"looker_view = {looker_view.view_name}")

    dataset_name = looker_view.view_name
    actor = self.source_config.actor
    sys_time = get_sys_time()

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.source_config.platform_name},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))
    dataset_snapshot.aspects.append(
        self._get_upsteam_lineage(looker_view, actor, sys_time)
    )
    dataset_snapshot.aspects.append(self._get_schema(looker_view, actor, sys_time))

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)

    return mce
def get_schema_metadata(report: SourceReport, node: DBTNode, platform: str) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in node.columns:
        field = SchemaField(
            fieldPath=column.name,
            nativeDataType=column.data_type,
            type=get_column_type(report, node.dbt_name, column.data_type),
            description=column.comment,
            nullable=False,  # TODO: actually autodetect this
            recursive=False,
        )
        canonical_schema.append(field)

    actor = "urn:li:corpuser:dbt_executor"
    sys_time = get_sys_time()
    last_modified = sys_time

    if node.max_loaded_at is not None:
        last_modified = int(
            dateutil.parser.parse(node.max_loaded_at).timestamp() * 1000
        )

    return SchemaMetadata(
        schemaName=node.dbt_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=last_modified, actor=actor),
        fields=canonical_schema,
    )
def _make_chart_mce(
    self, dashboard_element: LookerDashboardElement
) -> MetadataChangeEvent:
    actor = self.source_config.actor
    sys_time = get_sys_time()

    chart_urn = f"urn:li:chart:({self.source_config.platform_name},{dashboard_element.get_urn_element_id()})"
    chart_snapshot = ChartSnapshot(
        urn=chart_urn,
        aspects=[],
    )

    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
    )

    chart_type = self._get_chart_type(dashboard_element)

    chart_info = ChartInfoClass(
        type=chart_type,
        description=dashboard_element.description
        if dashboard_element.description is not None
        else "",
        title=dashboard_element.title if dashboard_element.title is not None else "",
        lastModified=last_modified,
        chartUrl=dashboard_element.url(self.source_config.base_url),
        inputs=dashboard_element.get_view_urns(self.source_config.platform_name),
    )
    chart_snapshot.aspects.append(chart_info)

    return MetadataChangeEvent(proposedSnapshot=chart_snapshot)
def get_schema_metadata(
    sql_report: SQLSourceReport, dataset_name: str, platform: str, columns: List[dict]
) -> SchemaMetadata:
    canonical_schema: List[SchemaField] = []
    for column in columns:
        field = SchemaField(
            fieldPath=column["name"],
            type=get_column_type(sql_report, dataset_name, column["type"]),
            nativeDataType=column.get("full_type", repr(column["type"])),
            description=column.get("comment", None),
            nullable=column["nullable"],
            recursive=False,
        )
        canonical_schema.append(field)

    actor = "urn:li:corpuser:etl"
    sys_time = get_sys_time()
    schema_metadata = SchemaMetadata(
        schemaName=dataset_name,
        platform=f"urn:li:dataPlatform:{platform}",
        version=0,
        hash="",
        platformSchema=MySqlDDL(tableSchema=""),
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
        fields=canonical_schema,
    )
    return schema_metadata
def get_records(
    self, workunit: WorkUnit
) -> Iterable[
    RecordEnvelope[
        Union[
            MetadataChangeEvent,
            MetadataChangeProposal,
            MetadataChangeProposalWrapper,
            UsageAggregationClass,
        ]
    ]
]:
    if isinstance(workunit, MetadataWorkUnit):
        if isinstance(
            workunit.metadata,
            (
                MetadataChangeEvent,
                MetadataChangeProposal,
                MetadataChangeProposalWrapper,
            ),
        ):
            workunit.metadata.systemMetadata = SystemMetadata(
                lastObserved=get_sys_time(), runId=self.ctx.run_id
            )
        if (
            isinstance(workunit.metadata, MetadataChangeEvent)
            and len(workunit.metadata.proposedSnapshot.aspects) == 0
        ):
            raise AttributeError("every mce must have at least one aspect")
        if not workunit.metadata.validate():
            invalid_mce = str(workunit.metadata)
            if black is not None:
                invalid_mce = black.format_str(invalid_mce, mode=black.FileMode())
            raise ValueError(
                f"source produced an invalid metadata work unit: {invalid_mce}"
            )
        yield RecordEnvelope(
            workunit.metadata,
            {
                "workunit_id": workunit.id,
            },
        )
    elif isinstance(workunit, UsageStatsWorkUnit):
        if not workunit.usageStats.validate():
            invalid_usage_stats = str(workunit.usageStats)
            if black is not None:
                invalid_usage_stats = black.format_str(
                    invalid_usage_stats, mode=black.FileMode()
                )
            raise ValueError(
                f"source produced an invalid usage stat: {invalid_usage_stats}"
            )
        yield RecordEnvelope(
            workunit.usageStats,
            {
                "workunit_id": workunit.id,
            },
        )
    else:
        raise ValueError(f"unknown WorkUnit type {type(workunit)}")
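# A standalone sketch (not from the source above) of the optional pretty-printing the
# snippet relies on: black.format_str reformats the repr of an invalid record so the
# error message is readable. The record repr below is made up for illustration, and
# this assumes the optional "black" dependency is installed.
import black

ugly_repr = "MetadataChangeEvent(proposedSnapshot=DatasetSnapshot(urn='urn:li:dataset:(urn:li:dataPlatform:kafka,my_topic,PROD)',aspects=[]))"
print(black.format_str(ugly_repr, mode=black.FileMode()))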
def _extract_record(self, topic: str) -> MetadataChangeEvent:
    logger.debug(f"topic = {topic}")
    platform = "kafka"
    dataset_name = topic
    actor = "urn:li:corpuser:etl"
    sys_time = get_sys_time()

    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
        aspects=[],  # we append to this list later on
    )
    dataset_snapshot.aspects.append(Status(removed=False))

    # Fetch schema from the registry.
    has_schema = True
    try:
        registered_schema = self.schema_registry_client.get_latest_version(
            topic + "-value"
        )
        schema = registered_schema.schema
    except Exception as e:
        self.report.report_warning(topic, f"failed to get schema: {e}")
        has_schema = False

    # Parse the schema
    fields: List[SchemaField] = []
    if has_schema and schema.schema_type == "AVRO":
        fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
    elif has_schema:
        self.report.report_warning(
            topic, f"unable to parse kafka schema type {schema.schema_type}"
        )

    if has_schema:
        schema_metadata = SchemaMetadata(
            schemaName=topic,
            version=0,
            hash=str(schema._hash),
            platform=f"urn:li:dataPlatform:{platform}",
            platformSchema=KafkaSchema(documentSchema=schema.schema_str),
            fields=fields,
            created=AuditStamp(time=sys_time, actor=actor),
            lastModified=AuditStamp(time=sys_time, actor=actor),
        )
        dataset_snapshot.aspects.append(schema_metadata)

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
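# A minimal sketch (the Avro schema below is made up) of how an Avro value-schema string
# is turned into SchemaField objects, mirroring the schema_util call in the snippet above.
from datahub.ingestion.extractor import schema_util

avro_value_schema = """
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "id", "type": "long"},
    {"name": "email", "type": ["null", "string"], "default": null}
  ]
}
"""
fields = schema_util.avro_schema_to_mce_fields(avro_value_schema)
for f in fields:
    print(f.fieldPath, f.nativeDataType)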
def get_upstream_lineage(upstream_urns: List[str]) -> UpstreamLineage:
    ucl: List[UpstreamClass] = []
    actor = "urn:li:corpuser:dbt_executor"
    sys_time = get_sys_time()

    for dep in upstream_urns:
        uc = UpstreamClass(
            dataset=dep,
            auditStamp=AuditStamp(actor=actor, time=sys_time),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        ucl.append(uc)

    return UpstreamLineage(upstreams=ucl)
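# A minimal usage sketch (the URNs below are made up) showing how the UpstreamLineage
# aspect returned above is typically attached to a dataset snapshot and wrapped in an MCE,
# assuming the same snapshot/MCE classes imported by the surrounding snippets.
from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent

upstream_urns = [
    "urn:li:dataset:(urn:li:dataPlatform:dbt,analytics.stg_orders,PROD)",
]
dataset_snapshot = DatasetSnapshot(
    urn="urn:li:dataset:(urn:li:dataPlatform:dbt,analytics.fct_orders,PROD)",
    aspects=[get_upstream_lineage(upstream_urns)],
)
mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)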
def _make_dashboard_and_chart_mces(
    self, looker_dashboard: LookerDashboard
) -> List[MetadataChangeEvent]:
    actor = self.source_config.actor
    sys_time = get_sys_time()

    chart_mces = [
        self._make_chart_mce(element)
        for element in looker_dashboard.dashboard_elements
    ]

    dashboard_urn = f"urn:li:dashboard:({self.source_config.platform_name},{looker_dashboard.get_urn_dashboard_id()})"
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=sys_time, actor=actor),
        lastModified=AuditStamp(time=sys_time, actor=actor),
    )

    dashboard_info = DashboardInfoClass(
        description=looker_dashboard.description
        if looker_dashboard.description is not None
        else "",
        title=looker_dashboard.title,
        charts=[mce.proposedSnapshot.urn for mce in chart_mces],
        lastModified=last_modified,
        dashboardUrl=looker_dashboard.url(self.source_config.base_url),
    )
    dashboard_snapshot.aspects.append(dashboard_info)

    owners = [OwnerClass(owner=actor, type=OwnershipTypeClass.DATAOWNER)]
    dashboard_snapshot.aspects.append(
        OwnershipClass(
            owners=owners,
            lastModified=AuditStampClass(
                time=sys_time, actor=self.source_config.actor
            ),
        )
    )

    dashboard_snapshot.aspects.append(Status(removed=looker_dashboard.is_deleted))

    dashboard_mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)

    return chart_mces + [dashboard_mce]
def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
    if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
        return mce

    terms_to_add = self.config.get_terms_to_add(mce.proposedSnapshot)
    if terms_to_add:
        terms = builder.get_or_add_aspect(
            mce,
            GlossaryTermsClass(
                terms=[],
                auditStamp=AuditStampClass(
                    time=builder.get_sys_time(), actor="urn:li:corpUser:restEmitter"
                ),
            ),
        )
        terms.terms.extend(terms_to_add)

    return mce
def _convert_evrs_to_profile(
    self, evrs: ExpectationSuiteValidationResult, pretty_name: str
) -> DatasetProfileClass:
    profile = DatasetProfileClass(timestampMillis=get_sys_time())

    for col, evrs_for_col in groupby_unsorted(
        evrs.results, key=self._get_column_from_evr
    ):
        if col is None:
            self._handle_convert_table_evrs(
                profile, evrs_for_col, pretty_name=pretty_name
            )
        else:
            self._handle_convert_column_evrs(
                profile, col, evrs_for_col, pretty_name=pretty_name
            )

    return profile
def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
    if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
        return mce

    owners_to_add = self.config.get_owners_to_add(mce.proposedSnapshot)
    if owners_to_add:
        ownership = builder.get_or_add_aspect(
            mce,
            OwnershipClass(
                owners=[],
                lastModified=AuditStampClass(
                    time=builder.get_sys_time(),
                    actor=self.config.default_actor,
                ),
            ),
        )
        ownership.owners.extend(owners_to_add)

    return mce
def construct_lineage_workunits(
    self, connector: ConnectorManifest
) -> Iterable[MetadataWorkUnit]:
    lineages = connector.lineages
    if lineages:
        for lineage in lineages:
            source_dataset = lineage.source_dataset
            source_platform = lineage.source_platform
            target_dataset = lineage.target_dataset
            target_platform = lineage.target_platform

            mce = models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=builder.make_dataset_urn(
                        target_platform, target_dataset, self.config.env
                    ),
                    aspects=[
                        models.UpstreamLineageClass(
                            upstreams=[
                                models.UpstreamClass(
                                    dataset=builder.make_dataset_urn(
                                        source_platform,
                                        source_dataset,
                                        self.config.env,
                                    ),
                                    type=models.DatasetLineageTypeClass.TRANSFORMED,
                                    auditStamp=models.AuditStampClass(
                                        time=builder.get_sys_time(),
                                        actor="urn:li:corpuser:datahub",
                                    ),
                                )
                            ]
                        )
                    ],
                )
            )

            wu = MetadataWorkUnit(id=source_dataset, mce=mce)
            self.report.report_workunit(wu)
            yield wu
def get_records(
    self, workunit: WorkUnit
) -> Iterable[
    RecordEnvelope[
        Union[
            MetadataChangeEvent,
            MetadataChangeProposal,
            MetadataChangeProposalWrapper,
            UsageAggregationClass,
        ]
    ]
]:
    if isinstance(workunit, MetadataWorkUnit):
        if isinstance(workunit.metadata, MetadataChangeEvent):
            mce = workunit.metadata
            mce.systemMetadata = SystemMetadata(
                lastObserved=get_sys_time(), runId=self.ctx.run_id
            )
            if len(mce.proposedSnapshot.aspects) == 0:
                raise AttributeError("every mce must have at least one aspect")
        if not workunit.metadata.validate():
            raise ValueError(
                f"source produced an invalid metadata work unit: {workunit.metadata}"
            )
        yield RecordEnvelope(
            workunit.metadata,
            {
                "workunit_id": workunit.id,
            },
        )
    elif isinstance(workunit, UsageStatsWorkUnit):
        if not workunit.usageStats.validate():
            raise ValueError(
                f"source produced an invalid usage stat: {workunit.usageStats}"
            )
        yield RecordEnvelope(
            workunit.usageStats,
            {
                "workunit_id": workunit.id,
            },
        )
    else:
        raise ValueError(f"unknown WorkUnit type {type(workunit)}")
    make_dataset_urn_with_platform_instance,
)
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
    SupportStatus,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit

logger = logging.getLogger(__name__)

auditStamp = models.AuditStampClass(
    time=get_sys_time(), actor="urn:li:corpUser:pythonEmitter"
)


class EntityConfig(EnvBasedSourceConfigBase):
    name: str
    type: str
    platform: str
    platform_instance: Optional[str]

    @validator("type")
    def type_must_be_supported(cls, v: str) -> str:
        allowed_types = ["dataset"]
        if v not in allowed_types:
            raise ConfigurationError(
                f"Type must be one of {allowed_types}, {v} is not yet supported."
            )
def test_simple_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=builder.get_sys_time(), actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 2

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record
def __init__(
    self,
    dataframe: DataFrame,
    spark: SparkSession,
    profiling_config: DataLakeProfilerConfig,
    report: DataLakeSourceReport,
    file_path: str,
):
    self.spark = spark
    self.dataframe = dataframe
    self.analyzer = AnalysisRunner(spark).onData(dataframe)
    self.column_specs = []
    self.row_count = dataframe.count()
    self.profiling_config = profiling_config
    self.file_path = file_path
    self.columns_to_profile = []
    self.ignored_columns = []
    self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
    self.report = report

    self.profile.rowCount = self.row_count
    self.profile.columnCount = len(dataframe.columns)

    column_types = {x.name: x.dataType for x in dataframe.schema.fields}

    if self.profiling_config.profile_table_level_only:
        return

    # get column distinct counts
    for column in dataframe.columns:
        if not self.profiling_config.allow_deny_patterns.allowed(column):
            self.ignored_columns.append(column)
            continue

        self.columns_to_profile.append(column)
        # Normal CountDistinct is ridiculously slow
        self.analyzer.addAnalyzer(ApproxCountDistinct(column))

    if self.profiling_config.max_number_of_fields_to_profile is not None:
        if (
            len(self.columns_to_profile)
            > self.profiling_config.max_number_of_fields_to_profile
        ):
            columns_being_dropped = self.columns_to_profile[
                self.profiling_config.max_number_of_fields_to_profile :
            ]
            self.columns_to_profile = self.columns_to_profile[
                : self.profiling_config.max_number_of_fields_to_profile
            ]

            self.report.report_file_dropped(
                f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} reached. Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})"
            )

    analysis_result = self.analyzer.run()
    analysis_metrics = AnalyzerContext.successMetricsAsJson(
        self.spark, analysis_result
    )

    # reshape distinct counts into dictionary
    column_distinct_counts = {
        x["instance"]: int(x["value"])
        for x in analysis_metrics
        if x["name"] == "ApproxCountDistinct"
    }

    select_numeric_null_counts = [
        count(
            when(
                isnan(c) | col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if isinstance(column_types[c], (DoubleType, FloatType))
    ]

    # PySpark doesn't support isnan() on non-float/double columns
    select_nonnumeric_null_counts = [
        count(
            when(
                col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if not isinstance(column_types[c], (DoubleType, FloatType))
    ]

    null_counts = dataframe.select(
        select_numeric_null_counts + select_nonnumeric_null_counts
    )
    column_null_counts = null_counts.toPandas().T[0].to_dict()
    column_null_fractions = {
        c: column_null_counts[c] / self.row_count for c in self.columns_to_profile
    }
    column_nonnull_counts = {
        c: self.row_count - column_null_counts[c] for c in self.columns_to_profile
    }
    column_unique_proportions = {
        c: (
            column_distinct_counts[c] / column_nonnull_counts[c]
            if column_nonnull_counts[c] > 0
            else 0
        )
        for c in self.columns_to_profile
    }

    if self.profiling_config.include_field_sample_values:
        # take sample and convert to Pandas DataFrame
        if self.row_count < NUM_SAMPLE_ROWS:
            # if row count is less than number to sample, just take all rows
            rdd_sample = dataframe.rdd.take(self.row_count)
        else:
            rdd_sample = dataframe.rdd.takeSample(False, NUM_SAMPLE_ROWS, seed=0)

    # init column specs with profiles
    for column in self.columns_to_profile:
        column_profile = DatasetFieldProfileClass(fieldPath=column)
        column_spec = _SingleColumnSpec(column, column_profile)

        column_profile.uniqueCount = column_distinct_counts.get(column)
        column_profile.uniqueProportion = column_unique_proportions.get(column)
        column_profile.nullCount = column_null_counts.get(column)
        column_profile.nullProportion = column_null_fractions.get(column)
        if self.profiling_config.include_field_sample_values:
            column_profile.sampleValues = [str(x[column]) for x in rdd_sample]

        column_spec.type_ = column_types[column]
        column_spec.cardinality = _convert_to_cardinality(
            column_distinct_counts[column],
            column_null_fractions[column],
        )

        self.column_specs.append(column_spec)
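# A standalone toy sketch (data made up) of the per-column null-count pattern used above:
# count(when(isnan(c) | col(c).isNull(), c)) for float/double columns, and a plain
# isNull() check for everything else, all collected in a single select().
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when

spark = SparkSession.builder.master("local[1]").appName("null_count_sketch").getOrCreate()
df = spark.createDataFrame(
    [(1.0, "a"), (float("nan"), None), (None, "c")],
    ["x", "y"],
)
null_counts = (
    df.select(
        count(when(isnan("x") | col("x").isNull(), "x")).alias("x"),  # numeric column
        count(when(col("y").isNull(), "y")).alias("y"),  # non-numeric column
    )
    .toPandas()
    .T[0]
    .to_dict()
)
print(null_counts)  # {'x': 2, 'y': 1}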
from datahub.emitter.mce_builder import get_sys_time, make_group_urn, make_user_urn
from datahub.ingestion.api.decorators import (  # SourceCapability,; capability,
    SupportStatus,
    config_class,
    platform_name,
    support_status,
)
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit

logger = logging.getLogger(__name__)

valid_status: models.StatusClass = models.StatusClass(removed=False)
auditStamp = models.AuditStampClass(
    time=get_sys_time(), actor="urn:li:corpUser:restEmitter"
)


class Owners(ConfigModel):
    users: Optional[List[str]]
    groups: Optional[List[str]]


class GlossaryTermConfig(ConfigModel):
    name: str
    description: str
    term_source: Optional[str]
    source_ref: Optional[str]
    source_url: Optional[str]
    owners: Optional[Owners]
def process_dataflow_node(
    self,
    node: Dict[str, Any],
    flow_urn: str,
    new_dataset_ids: List[str],
    new_dataset_mces: List[MetadataChangeEvent],
    s3_formats: typing.DefaultDict[str, Set[Union[str, None]]],
) -> Dict[str, Any]:
    node_type = node["NodeType"]

    # for nodes representing datasets, we construct a dataset URN accordingly
    if node_type in ["DataSource", "DataSink"]:
        node_args = {x["Name"]: json.loads(x["Value"]) for x in node["Args"]}

        # if data object is Glue table
        if "database" in node_args and "table_name" in node_args:
            full_table_name = f"{node_args['database']}.{node_args['table_name']}"

            # we know that the table will already be covered when ingesting Glue tables
            node_urn = f"urn:li:dataset:(urn:li:dataPlatform:glue,{full_table_name},{self.env})"

        # if data object is S3 bucket
        elif node_args.get("connection_type") == "s3":
            # remove S3 prefix (s3://)
            s3_name = node_args["connection_options"]["path"][5:]

            if s3_name.endswith("/"):
                s3_name = s3_name[:-1]

            # append S3 format if different ones exist
            if len(s3_formats[s3_name]) > 1:
                node_urn = f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name}_{node_args.get('format')},{self.env})"
            else:
                node_urn = (
                    f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{self.env})"
                )

            dataset_snapshot = DatasetSnapshot(
                urn=node_urn,
                aspects=[],
            )

            dataset_snapshot.aspects.append(Status(removed=False))
            dataset_snapshot.aspects.append(
                OwnershipClass(
                    owners=[],
                    lastModified=AuditStampClass(
                        time=mce_builder.get_sys_time(),
                        actor="urn:li:corpuser:datahub",
                    ),
                )
            )
            dataset_snapshot.aspects.append(
                DatasetPropertiesClass(
                    customProperties={k: str(v) for k, v in node_args.items()},
                    tags=[],
                )
            )

            new_dataset_mces.append(
                MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            )
            new_dataset_ids.append(f"{node['NodeType']}-{node['Id']}")

        else:
            raise ValueError(f"Unrecognized Glue data object type: {node_args}")

    # otherwise, a node represents a transformation
    else:
        node_urn = mce_builder.make_data_job_urn_with_flow(
            flow_urn, job_id=f'{node["NodeType"]}-{node["Id"]}'
        )

    return {
        **node,
        "urn": node_urn,
        # to be filled in after traversing edges
        "inputDatajobs": [],
        "inputDatasets": [],
        "outputDatasets": [],
    }
def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent:
    def get_owner(time: int) -> OwnershipClass:
        owner = table.get("Owner")
        if owner:
            owners = [
                OwnerClass(
                    owner=f"urn:li:corpuser:{owner}",
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
        else:
            owners = []
        return OwnershipClass(
            owners=owners,
            lastModified=AuditStampClass(
                time=time,
                actor="urn:li:corpuser:datahub",
            ),
        )

    def get_dataset_properties() -> DatasetPropertiesClass:
        return DatasetPropertiesClass(
            description=table.get("Description"),
            customProperties={
                **table.get("Parameters", {}),
                **{
                    k: str(v)
                    for k, v in table["StorageDescriptor"].items()
                    if k not in ["Columns", "Parameters"]
                },
            },
            uri=table.get("Location"),
            tags=[],
        )

    def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata:
        schema = table["StorageDescriptor"]["Columns"]
        fields: List[SchemaField] = []
        for field in schema:
            schema_field = SchemaField(
                fieldPath=field["Name"],
                nativeDataType=field["Type"],
                type=get_column_type(
                    glue_source, field["Type"], table_name, field["Name"]
                ),
                description=field.get("Comment"),
                recursive=False,
                nullable=True,
            )
            fields.append(schema_field)

        return SchemaMetadata(
            schemaName=table_name,
            version=0,
            fields=fields,
            platform="urn:li:dataPlatform:glue",
            created=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
            lastModified=AuditStamp(time=sys_time, actor="urn:li:corpuser:etl"),
            hash="",
            platformSchema=MySqlDDL(tableSchema=""),
        )

    sys_time = mce_builder.get_sys_time()
    dataset_snapshot = DatasetSnapshot(
        urn=f"urn:li:dataset:(urn:li:dataPlatform:glue,{table_name},{self.env})",
        aspects=[],
    )

    dataset_snapshot.aspects.append(Status(removed=False))
    dataset_snapshot.aspects.append(get_owner(sys_time))
    dataset_snapshot.aspects.append(get_dataset_properties())
    dataset_snapshot.aspects.append(get_schema_metadata(self))

    metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    return metadata_record
def generate_dataset_profile(  # noqa: C901 (complexity)
    self,
) -> DatasetProfileClass:
    self.dataset.set_default_expectation_argument(
        "catch_exceptions", self.config.catch_exceptions
    )

    profile = DatasetProfileClass(timestampMillis=get_sys_time())
    if self.partition:
        profile.partitionSpec = PartitionSpecClass(partition=self.partition)
    profile.fieldProfiles = []
    self._get_dataset_rows(profile)

    all_columns = self.dataset.get_table_columns()
    profile.columnCount = len(all_columns)
    columns_to_profile = set(self._get_columns_to_profile())

    logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries")
    self.query_combiner.flush()

    columns_profiling_queue: List[_SingleColumnSpec] = []
    for column in all_columns:
        column_profile = DatasetFieldProfileClass(fieldPath=column)
        profile.fieldProfiles.append(column_profile)

        if column in columns_to_profile:
            column_spec = _SingleColumnSpec(column, column_profile)
            columns_profiling_queue.append(column_spec)

            self._get_column_type(column_spec, column)
            self._get_column_cardinality(column_spec, column)

    logger.debug(f"profiling {self.dataset_name}: flushing stage 2 queries")
    self.query_combiner.flush()

    assert profile.rowCount is not None
    row_count: int = profile.rowCount

    telemetry.telemetry_instance.ping(
        "profile_sql_table",
        # bucket by taking floor of log of the number of rows scanned
        {
            "rows_profiled": 10 ** int(log10(row_count + 1)),
        },
    )

    for column_spec in columns_profiling_queue:
        column = column_spec.column
        column_profile = column_spec.column_profile
        type_ = column_spec.type_
        cardinality = column_spec.cardinality

        non_null_count = column_spec.nonnull_count
        unique_count = column_spec.unique_count

        if self.config.include_field_null_count and non_null_count is not None:
            null_count = row_count - non_null_count
            if null_count < 0:
                null_count = 0

            column_profile.nullCount = null_count
            if row_count > 0:
                column_profile.nullProportion = null_count / row_count
                # Sometimes this value is bigger than 1 because of the approx queries
                if column_profile.nullProportion > 1:
                    column_profile.nullProportion = 1

        if unique_count is not None:
            column_profile.uniqueCount = unique_count
            if non_null_count is not None and non_null_count > 0:
                column_profile.uniqueProportion = unique_count / non_null_count
                # Sometimes this value is bigger than 1 because of the approx queries
                if column_profile.uniqueProportion > 1:
                    column_profile.uniqueProportion = 1

        self._get_dataset_column_sample_values(column_profile, column)

        if (
            type_ == ProfilerDataType.INT
            or type_ == ProfilerDataType.FLOAT
            or type_ == ProfilerDataType.NUMERIC
        ):
            if cardinality == Cardinality.UNIQUE:
                pass
            elif cardinality in [
                Cardinality.ONE,
                Cardinality.TWO,
                Cardinality.VERY_FEW,
                Cardinality.FEW,
                Cardinality.MANY,
                Cardinality.VERY_MANY,
                Cardinality.UNIQUE,
            ]:
                self._get_dataset_column_min(column_profile, column)
                self._get_dataset_column_max(column_profile, column)
                self._get_dataset_column_mean(column_profile, column)
                self._get_dataset_column_median(column_profile, column)

                if type_ == ProfilerDataType.INT:
                    self._get_dataset_column_stdev(column_profile, column)

                self._get_dataset_column_quantiles(column_profile, column)
                self._get_dataset_column_histogram(column_profile, column)
                if cardinality in [
                    Cardinality.ONE,
                    Cardinality.TWO,
                    Cardinality.VERY_FEW,
                    Cardinality.FEW,
                ]:
                    self._get_dataset_column_distinct_value_frequencies(
                        column_profile,
                        column,
                    )
            else:  # unknown cardinality - skip
                pass

        elif type_ == ProfilerDataType.STRING:
            if cardinality in [
                Cardinality.ONE,
                Cardinality.TWO,
                Cardinality.VERY_FEW,
                Cardinality.FEW,
            ]:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

        elif type_ == ProfilerDataType.DATETIME:
            self._get_dataset_column_min(column_profile, column)
            self._get_dataset_column_max(column_profile, column)

            # FIXME: Re-add histogram once kl_divergence has been modified to support datetimes

            if cardinality in [
                Cardinality.ONE,
                Cardinality.TWO,
                Cardinality.VERY_FEW,
                Cardinality.FEW,
            ]:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

        else:
            if cardinality in [
                Cardinality.ONE,
                Cardinality.TWO,
                Cardinality.VERY_FEW,
                Cardinality.FEW,
            ]:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

    logger.debug(f"profiling {self.dataset_name}: flushing stage 3 queries")
    self.query_combiner.flush()
    return profile
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
    platform = "mongodb"

    database_names: List[str] = self.mongo_client.list_database_names()

    # traverse databases in sorted order so output is consistent
    for database_name in sorted(database_names):
        if database_name in DENY_DATABASE_LIST:
            continue
        if not self.config.database_pattern.allowed(database_name):
            self.report.report_dropped(database_name)
            continue

        database = self.mongo_client[database_name]
        collection_names: List[str] = database.list_collection_names()

        # traverse collections in sorted order so output is consistent
        for collection_name in sorted(collection_names):
            dataset_name = f"{database_name}.{collection_name}"

            if not self.config.collection_pattern.allowed(dataset_name):
                self.report.report_dropped(dataset_name)
                continue

            dataset_snapshot = DatasetSnapshot(
                urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})",
                aspects=[],
            )

            dataset_properties = DatasetPropertiesClass(
                tags=[],
                customProperties={},
            )
            dataset_snapshot.aspects.append(dataset_properties)

            if self.config.enableSchemaInference:
                collection_schema = construct_schema_pymongo(
                    database[collection_name],
                    delimiter=".",
                    use_random_sampling=self.config.useRandomSampling,
                    sample_size=self.config.schemaSamplingSize,
                )

                # initialize the schema for the collection
                canonical_schema: List[SchemaField] = []

                # append each schema field (sort so output is consistent)
                for schema_field in sorted(
                    collection_schema.values(), key=lambda x: x["delimited_name"]
                ):
                    field = SchemaField(
                        fieldPath=schema_field["delimited_name"],
                        nativeDataType=self.get_pymongo_type_string(
                            schema_field["type"], dataset_name
                        ),
                        type=self.get_field_type(schema_field["type"], dataset_name),
                        description=None,
                        nullable=schema_field["nullable"],
                        recursive=False,
                    )
                    canonical_schema.append(field)

                # create schema metadata object for collection
                actor = "urn:li:corpuser:etl"
                sys_time = get_sys_time()
                schema_metadata = SchemaMetadata(
                    schemaName=collection_name,
                    platform=f"urn:li:dataPlatform:{platform}",
                    version=0,
                    hash="",
                    platformSchema=SchemalessClass(),
                    created=AuditStamp(time=sys_time, actor=actor),
                    lastModified=AuditStamp(time=sys_time, actor=actor),
                    fields=canonical_schema,
                )

                dataset_snapshot.aspects.append(schema_metadata)

            # TODO: use list_indexes() or index_information() to get index information
            # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.

            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            wu = MetadataWorkUnit(id=dataset_name, mce=mce)
            self.report.report_workunit(wu)
            yield wu