def Mean(self, column, where=None):
    # Mean(...) below refers to the pydeequ Mean analyzer imported at module
    # level, not to this helper method.
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(Mean(column, where)).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    # The DataFrame and JSON views of the success metrics must agree.
    self.assertEqual(df_from_json.select("value").collect(),
                     result_df.select("value").collect())
    return result_df.select("value").collect()
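# A minimal, self-contained sketch of what the helper above exercises: run a
# Mean analyzer, then confirm that the JSON and DataFrame views of the metrics
# agree. The SparkSession setup and the toy column "a" are illustrative
# assumptions, not part of the original test suite.
import pydeequ
from pyspark.sql import Row, SparkSession
from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Mean

spark = (SparkSession.builder
         .config("spark.jars.packages", pydeequ.deequ_maven_coord)
         .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
         .getOrCreate())
df = spark.createDataFrame([Row(a=1.0), Row(a=2.0), Row(a=3.0)])

result = AnalysisRunner(spark).onData(df).addAnalyzer(Mean("a")).run()

# Two equivalent views of the same success metrics.
result_df = AnalyzerContext.successMetricsAsDataFrame(spark, result)
result_json = AnalyzerContext.successMetricsAsJson(spark, result)

df_from_json = spark.read.json(spark.sparkContext.parallelize([result_json]))
assert df_from_json.select("value").collect() == result_df.select("value").collect()
result_df.show()  # the mean of column "a" should be 2.0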
def prep_mean_value(self, column: str) -> None:
    if self.profiling_config.include_field_mean_value:
        self.analyzer.addAnalyzer(Mean(column))
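# A hypothetical sketch of how prep_mean_value could sit inside a profiler
# class that conditionally registers analyzers per column and then runs them
# in one pass. The class name, config dataclass, and profile() driver below
# are illustrative assumptions, not the project's actual structure.
from dataclasses import dataclass
from typing import List

from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Completeness, Mean


@dataclass
class ProfilingConfig:
    include_field_mean_value: bool = True
    include_field_completeness: bool = True


class DatasetProfiler:
    def __init__(self, spark, df, profiling_config: ProfilingConfig):
        self.spark = spark
        self.profiling_config = profiling_config
        # Builder that the prep_* methods accumulate analyzers onto.
        self.analyzer = AnalysisRunner(spark).onData(df)

    def prep_mean_value(self, column: str) -> None:
        if self.profiling_config.include_field_mean_value:
            self.analyzer.addAnalyzer(Mean(column))

    def prep_completeness(self, column: str) -> None:
        if self.profiling_config.include_field_completeness:
            self.analyzer.addAnalyzer(Completeness(column))

    def profile(self, numeric_columns: List[str]):
        for column in numeric_columns:
            self.prep_mean_value(column)
            self.prep_completeness(column)
        result = self.analyzer.run()
        return AnalyzerContext.successMetricsAsDataFrame(self.spark, result)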
from pydeequ.repository import FileSystemMetricsRepository, ResultKey

metrics_file = FileSystemMetricsRepository.helper_metrics_file(spark, 'verify_newfile.json', '/home/ec2-user/')
repository = FileSystemMetricsRepository(spark, metrics_file)
key_tags = {'tag': 'analyzer'}
resultKey = ResultKey(spark, ResultKey.current_milli_time(), key_tags)


# In[12]:


from pydeequ.analyzers import (AnalysisRunner, AnalyzerContext, Completeness, Compliance,
                               ApproxCountDistinct, Size, Mean, Correlation,
                               MutualInformation, PatternMatch)

###### Profiling #############
analysisResult = AnalysisRunner(spark) \
    .onData(df) \
    .addAnalyzer(Size()) \
    .addAnalyzer(Completeness("Emp ID")) \
    .addAnalyzer(ApproxCountDistinct('Emp ID')) \
    .addAnalyzer(Mean('Salary')) \
    .addAnalyzer(Completeness('Month of Joining')) \
    .addAnalyzer(Compliance("Salary greater than 10000", "Salary >=55000")) \
    .addAnalyzer(Correlation('Age in Company (Years)', 'Age in Yrs')) \
    .addAnalyzer(Correlation('Age in Yrs', 'Salary')) \
    .addAnalyzer(Correlation('Age in Yrs', 'Weight in Kgs')) \
    .addAnalyzer(MutualInformation(['Age in Yrs', 'Weight in Kgs'])) \
    .addAnalyzer(MutualInformation(['Age in Yrs', 'Age in Company (Years)'])) \
    .useRepository(repository) \
    .saveOrAppendResult(resultKey) \
    .run()


# In[13]:


# Path of the JSON file backing the metrics repository
repository.path


# In[14]:


## Convert the analysis result to a DataFrame
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
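# In[15]:


# A sketch of reading the persisted metrics back out of the repository; the
# .before() filter and the show() calls are illustrative additions, not part
# of the original notebook.
analysisResult_df.show(truncate=False)

stored_metrics_df = repository.load() \
    .before(ResultKey.current_milli_time()) \
    .getSuccessMetricsAsDataFrame()
stored_metrics_df.show(truncate=False)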