Example #1
 def Mean(self, column, where=None):
     # Run a Mean analyzer on the test DataFrame.
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(Mean(column, where)).run()
     # Materialize the metrics both as a DataFrame and as JSON.
     result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     # Both representations must report the same metric value.
     self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
     return result_df.select("value").collect()
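Outside the test harness the same pattern reduces to a few lines. A minimal sketch, assuming a live SparkSession named spark and a DataFrame df with a numeric "Salary" column (both hypothetical names):

from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Mean

# Compute the mean of a single column and show the resulting metric.
result = AnalysisRunner(spark).onData(df).addAnalyzer(Mean("Salary")).run()
AnalyzerContext.successMetricsAsDataFrame(spark, result).show()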
Example #2
 def prep_mean_value(self, column: str) -> None:
     # Register a Mean analyzer for this column only when the profiling
     # config asks for field-level mean values.
     if self.profiling_config.include_field_mean_value:
         self.analyzer.addAnalyzer(Mean(column))
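For context, a hedged sketch of how such a builder might be assembled and run; spark, df, and the column names are hypothetical stand-ins, with the builder playing the role of self.analyzer above:

from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Mean

builder = AnalysisRunner(spark).onData(df)   # plays the role of self.analyzer
for column in ["Salary", "Age in Yrs"]:      # hypothetical numeric columns
    builder = builder.addAnalyzer(Mean(column))
# Execute all accumulated analyzers in one pass over the data.
metrics_df = AnalyzerContext.successMetricsAsDataFrame(spark, builder.run())
metrics_df.show()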
Example #3
from pydeequ.repository import FileSystemMetricsRepository, ResultKey

# Persist metrics to a JSON file on the local filesystem.
metrics_file = FileSystemMetricsRepository.helper_metrics_file(spark, 'verify_newfile.json','/home/ec2-user/')
repository = FileSystemMetricsRepository(spark, metrics_file)
key_tags = {'tag': 'analyzer'}

# A ResultKey (timestamp plus tags) identifies this run so its metrics can be retrieved later.
resultKey = ResultKey(spark, ResultKey.current_milli_time(), key_tags)


# In[12]:


from pydeequ.analyzers import (AnalysisRunner, AnalyzerContext,
                               Completeness, Compliance, ApproxCountDistinct,
                               Size, Mean, Correlation, MutualInformation, PatternMatch)
###### Profiling ######
analysisResult = AnalysisRunner(spark) \
    .onData(df) \
    .addAnalyzer(Size()) \
    .addAnalyzer(Completeness("Emp ID")) \
    .addAnalyzer(ApproxCountDistinct('Emp ID')) \
    .addAnalyzer(Mean('Salary')) \
    .addAnalyzer(Completeness('Month of Joining')) \
    .addAnalyzer(Compliance("Salary at least 55000", "Salary >= 55000")) \
    .addAnalyzer(Correlation('Age in Company (Years)', 'Age in Yrs')) \
    .addAnalyzer(Correlation('Age in Yrs', 'Salary')) \
    .addAnalyzer(Correlation('Age in Yrs', 'Weight in Kgs')) \
    .addAnalyzer(MutualInformation(['Age in Yrs', 'Weight in Kgs'])) \
    .addAnalyzer(MutualInformation(['Age in Yrs', 'Age in Company (Years)'])) \
    .useRepository(repository) \
    .saveOrAppendResult(resultKey) \
    .run()


# In[13]:


# Where the metrics file was written.
repository.path


# In[14]:


# Collect the analysis result as a DataFrame of success metrics.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
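

Because the run above went through useRepository and saveOrAppendResult, the stored metrics can also be read back later. A minimal sketch, reusing the repository and ResultKey objects created earlier:

# Read back all metrics stored before "now" from the same repository.
loaded_df = repository.load() \
    .before(ResultKey.current_milli_time()) \
    .getSuccessMetricsAsDataFrame()
loaded_df.show()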