Example #1
 def Compliance(self, instance, predicate, where=None):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(Compliance(instance, predicate, where)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
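This helper (and the similar ones below) assumes a unittest-style fixture that exposes self.spark, self.sc, self.df and self.AnalysisRunner. A minimal sketch of such a fixture, assuming PyDeequ's documented Spark setup and an invented three-row DataFrame with columns "a", "b" and "c":
 import unittest

 import pydeequ
 from pydeequ.analyzers import AnalysisRunner
 from pyspark.sql import SparkSession

 class AnalyzerTestFixture(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         # SparkSession wired up with the Deequ jar, as shown in the PyDeequ README.
         cls.spark = (
             SparkSession.builder
             .config("spark.jars.packages", pydeequ.deequ_maven_coord)
             .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
             .getOrCreate()
         )
         cls.sc = cls.spark.sparkContext
         cls.AnalysisRunner = AnalysisRunner(cls.spark)
         # Invented three-row sample; the real test data may differ.
         cls.df = cls.spark.createDataFrame(
             [("foo", 1, 5), ("bar", 2, 6), ("baz", 3, None)],
             schema=["a", "b", "c"],
         )

     @classmethod
     def tearDownClass(cls):
         cls.spark.stop()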
Example #2
 def UniqueValueRatio(self, columns, where=None):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(UniqueValueRatio(columns, where)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
Example #3
 def ApproxQuantiles(self, column, quantiles):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(ApproxQuantiles(column, quantiles)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
Example #4
 def test_PatternMatch(self):
     result = (
         self.AnalysisRunner.onData(self.df).addAnalyzer(PatternMatch(column="a", pattern_regex="ba(r|z)")).run()
     )
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     self.assertEqual(result_df.select("value").collect(), [Row(value=0.0)])
Example #5
 def Histogram_maxBins(self, column, binningUdf=None, maxDetailBins: int = None, where: str = None):
     result = (
         self.AnalysisRunner.onData(self.df).addAnalyzer(Histogram(column, binningUdf, maxDetailBins, where)).run()
     )
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
Example #6
 def test_KLLSketch(self):
     result = (
         self.AnalysisRunner.onData(self.df).addAnalyzer(KLLSketch("b", KLLParameters(self.spark, 2, 0.64, 2))).run()
     )
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_df.show()
     return result_df.select("value").collect()
Example #7
    def __init__(
        self,
        dataframe: DataFrame,
        spark: SparkSession,
        profiling_config: DataLakeProfilerConfig,
        report: DataLakeSourceReport,
        file_path: str,
    ):
        self.spark = spark
        self.dataframe = dataframe
        self.analyzer = AnalysisRunner(spark).onData(dataframe)
        self.column_specs = []
        self.row_count = dataframe.count()
        self.profiling_config = profiling_config
        self.file_path = file_path
        self.columns_to_profile = []
        self.ignored_columns = []
        self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
        self.report = report

        self.profile.rowCount = self.row_count
        self.profile.columnCount = len(dataframe.columns)

        column_types = {x.name: x.dataType for x in dataframe.schema.fields}

        if self.profiling_config.profile_table_level_only:
            return

        # get column distinct counts
        for column in dataframe.columns:

            if not self.profiling_config.allow_deny_patterns.allowed(column):
                self.ignored_columns.append(column)
                continue

            self.columns_to_profile.append(column)
            # Normal CountDistinct is ridiculously slow
            self.analyzer.addAnalyzer(ApproxCountDistinct(column))

        if self.profiling_config.max_number_of_fields_to_profile is not None:
            max_fields = self.profiling_config.max_number_of_fields_to_profile
            if len(self.columns_to_profile) > max_fields:
                columns_being_dropped = self.columns_to_profile[max_fields:]
                self.columns_to_profile = self.columns_to_profile[:max_fields]

                self.report.report_file_dropped(
                    f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} reached. Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})"
                )

        analysis_result = self.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsJson(
            self.spark, analysis_result)

        # reshape distinct counts into dictionary
        column_distinct_counts = {
            x["instance"]: int(x["value"])
            for x in analysis_metrics if x["name"] == "ApproxCountDistinct"
        }

        select_numeric_null_counts = [
            count(when(
                isnan(c) | col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if isinstance(column_types[c], (DoubleType, FloatType))
        ]

        # PySpark doesn't support isnan() on non-float/double columns
        select_nonnumeric_null_counts = [
            count(when(
                col(c).isNull(),
                c,
            )).alias(c) for c in self.columns_to_profile
            if not isinstance(column_types[c], (DoubleType, FloatType))
        ]

        null_counts = dataframe.select(select_numeric_null_counts +
                                       select_nonnumeric_null_counts)
        column_null_counts = null_counts.toPandas().T[0].to_dict()
        column_null_fractions = {
            c: column_null_counts[c] / self.row_count
            for c in self.columns_to_profile
        }
        column_nonnull_counts = {
            c: self.row_count - column_null_counts[c]
            for c in self.columns_to_profile
        }

        column_unique_proportions = {
            c: (column_distinct_counts[c] / column_nonnull_counts[c]
                if column_nonnull_counts[c] > 0 else 0)
            for c in self.columns_to_profile
        }

        if self.profiling_config.include_field_sample_values:
            # take sample and convert to Pandas DataFrame
            if self.row_count < NUM_SAMPLE_ROWS:
                # if row count is less than number to sample, just take all rows
                rdd_sample = dataframe.rdd.take(self.row_count)
            else:
                rdd_sample = dataframe.rdd.takeSample(False,
                                                      NUM_SAMPLE_ROWS,
                                                      seed=0)

        # init column specs with profiles
        for column in self.columns_to_profile:
            column_profile = DatasetFieldProfileClass(fieldPath=column)

            column_spec = _SingleColumnSpec(column, column_profile)

            column_profile.uniqueCount = column_distinct_counts.get(column)
            column_profile.uniqueProportion = column_unique_proportions.get(
                column)
            column_profile.nullCount = column_null_counts.get(column)
            column_profile.nullProportion = column_null_fractions.get(column)
            if self.profiling_config.include_field_sample_values:
                column_profile.sampleValues = [
                    str(x[column]) for x in rdd_sample
                ]

            column_spec.type_ = column_types[column]
            column_spec.cardinality = _convert_to_cardinality(
                column_distinct_counts[column],
                column_null_fractions[column],
            )

            self.column_specs.append(column_spec)
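For reference, the dict comprehension near the top of this method reshapes the JSON metrics returned by Deequ; a self-contained sketch with a hypothetical (invented) metrics payload:
 # Hypothetical successMetricsAsJson output: a list of metric dicts with
 # "entity", "instance", "name" and "value" fields (values invented here).
 analysis_metrics = [
     {"entity": "Column", "instance": "price", "name": "ApproxCountDistinct", "value": 42.0},
     {"entity": "Column", "instance": "name", "name": "ApproxCountDistinct", "value": 17.0},
 ]

 # Same reshaping as in the profiler above: column name -> distinct count.
 column_distinct_counts = {
     x["instance"]: int(x["value"])
     for x in analysis_metrics if x["name"] == "ApproxCountDistinct"
 }
 assert column_distinct_counts == {"price": 42, "name": 17}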
Example #8
    def ingest_table(self, full_path: str, relative_path: str,
                     is_aws: bool) -> Iterable[MetadataWorkUnit]:

        table_name = self.get_table_name(relative_path, full_path)

        # yield the table schema first
        logger.debug(
            f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        yield from self.get_table_schema(full_path, table_name, is_aws)

        # If profiling is not enabled, skip the rest
        if not self.source_config.profiling.enabled:
            return

        # read in the whole table with Spark for profiling
        table = self.read_file_spark(full_path, is_aws)

        # if table is not readable, skip
        if table is None:
            self.report.report_warning(
                table_name,
                f"unable to read table {table_name} from file {full_path}")
            return

        with PerfTimer() as timer:
            # init PySpark analysis object
            logger.debug(
                f"Profiling {full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler = _SingleTableProfiler(
                table,
                self.spark,
                self.source_config.profiling,
                self.report,
                full_path,
            )

            logger.debug(
                f"Profiling {full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            # instead of computing each profile individually, we run them all in a single analyzer.run() call
            # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
            # since multiple profiles reuse computations, this saves a lot of time
            table_profiler.prepare_table_profiles()

            # compute the profiles
            logger.debug(
                f"Profiling {full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            analysis_result = table_profiler.analyzer.run()
            analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
                self.spark, analysis_result)

            logger.debug(
                f"Profiling {full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler.extract_table_profiles(analysis_metrics)

            time_taken = timer.elapsed_seconds()

            logger.info(
                f"Finished profiling {full_path}; took {time_taken:.3f} seconds"
            )

            self.profiling_times_taken.append(time_taken)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=make_dataset_urn(self.source_config.platform, table_name,
                                       self.source_config.env),
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=table_profiler.profile,
        )
        wu = MetadataWorkUnit(
            id=f"profile-{self.source_config.platform}-{full_path}", mcp=mcp)
        self.report.report_workunit(wu)
        yield wu
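The comments in the example above note that all profiles are registered on one AnalysisRunner and computed in a single run() call. A minimal PyDeequ sketch of that batching pattern (the function name, DataFrame and column list are assumptions, not part of the source):
 from pydeequ.analyzers import (AnalysisRunner, AnalyzerContext,
                                ApproxCountDistinct, Completeness)

 def profile_columns(spark, df, columns):
     # Register one analyzer per metric, then trigger a single run();
     # Deequ shares the underlying scans across all registered analyzers.
     builder = AnalysisRunner(spark).onData(df)
     for c in columns:
         builder = builder.addAnalyzer(ApproxCountDistinct(c))
         builder = builder.addAnalyzer(Completeness(c))
     result = builder.run()
     return AnalyzerContext.successMetricsAsDataFrame(spark, result)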
Example #9
    def get_table_profile(self, table_data: TableData,
                          dataset_urn: str) -> Iterable[MetadataWorkUnit]:
        # read in the whole table with Spark for profiling
        table = None
        try:
            table = self.read_file_spark(
                table_data.table_path,
                os.path.splitext(table_data.full_path)[1])
        except Exception as e:
            logger.error(e)

        # if table is not readable, skip
        if table is None:
            self.report.report_warning(
                table_data.display_name,
                f"unable to read table {table_data.display_name} from file {table_data.full_path}",
            )
            return

        with PerfTimer() as timer:
            # init PySpark analysis object
            logger.debug(
                f"Profiling {table_data.full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler = _SingleTableProfiler(
                table,
                self.spark,
                self.source_config.profiling,
                self.report,
                table_data.full_path,
            )

            logger.debug(
                f"Profiling {table_data.full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            # instead of computing each profile individually, we run them all in a single analyzer.run() call
            # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
            # since multiple profiles reuse computations, this saves a lot of time
            table_profiler.prepare_table_profiles()

            # compute the profiles
            logger.debug(
                f"Profiling {table_data.full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            analysis_result = table_profiler.analyzer.run()
            analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
                self.spark, analysis_result)

            logger.debug(
                f"Profiling {table_data.full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
            )
            table_profiler.extract_table_profiles(analysis_metrics)

            time_taken = timer.elapsed_seconds()

            logger.info(
                f"Finished profiling {table_data.full_path}; took {time_taken:.3f} seconds"
            )

            self.profiling_times_taken.append(time_taken)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            changeType=ChangeTypeClass.UPSERT,
            aspectName="datasetProfile",
            aspect=table_profiler.profile,
        )
        wu = MetadataWorkUnit(
            id=f"profile-{self.source_config.platform}-{table_data.table_path}",
            mcp=mcp)
        self.report.report_workunit(wu)
        yield wu
Example #10
 def test_Size(self):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(Size()).run()
     # result_df = result.select('value').collect()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_df_row = result_df.select("value").collect()
     self.assertEqual(result_df_row, [Row(value=3.0)])
Example #11
 def Correlation(self, column1, column2, where=None):
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(Correlation(column1, column2, where)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     AnalyzerContext.successMetricsAsJson(self.spark, result)
     return result_df.select("value").collect()
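A hedged usage sketch for the helper-style methods above, assuming the fixture from the sketch after Example #1 (the test class, test name and assertion are invented):
 class CorrelationUsageExample(AnalyzerTestFixture):
     # Assumes the Correlation helper from Example #11 is defined on (or mixed
     # into) this class; column names "b" and "c" come from the invented fixture.
     def test_correlation_between_numeric_columns(self):
         values = self.Correlation("b", "c")
         # Exactly one Correlation success-metric row is expected.
         self.assertEqual(len(values), 1)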