def filter_outliers(dataframe, exclude_columns):
    """
    For every feature, except those in exclude_columns, set all outliers to NULL.
    """
    for column in dataframe.columns:
        if column in exclude_columns:
            continue
        # Exclude boolean types.
        if dataframe.schema[column].dataType == BooleanType():
            continue
        stats = dataframe \
            .select(_mean(col(column)).alias('mean'),
                    stddev(col(column)).alias('std')) \
            .collect()
        mean = stats[0]['mean']
        std = stats[0]['std']
        print("mean: %s; std: %s" % (str(mean), str(std)))
        count_before = dataframe.filter(col(column).isNull()).count()
        dataframe = dataframe.withColumn(
            column,
            when(abs((col(column) - mean) / std) < 3, col(column)).otherwise(None))
        print("Set %s entries to NULL because of z-score (3) for %s." % (
            str(dataframe.filter(col(column).isNull()).count() - count_before),
            column))
    return dataframe
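# Minimal usage sketch for filter_outliers (illustrative only, not from the original
# source): assumes a local SparkSession and that the module-level imports match the
# names used inside the function; the column names below are made up.
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean as _mean, stddev, col, when, abs
from pyspark.sql.types import BooleanType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, True), (2.0, False), (100.0, True)], ["value", "flag"])
cleaned = filter_outliers(df, exclude_columns=["flag"])
cleaned.show()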
def find(self, data, spark_context):
    rows, columns = spark_shape(data)
    n_clusters = get_n_clusters(data, data.columns[-1])
    columns -= 2
    mean_columns = map(lambda x: _mean(col(x)).alias('mean'), data.columns[:-2])
    df_stats = data.select(*mean_columns).collect()
    df = add_iter(data)
    self.x_center = np.array(df_stats[0])
    self.centroids = cluster_centroid(df, spark_context, n_clusters, 3)
    self.diameter = find_diameter(df, spark_context, 3)
    ch = float(rows - n_clusters) / float(n_clusters - 1)
    self.cluster_sizes = count_cluster_sizes(df, n_clusters, spark_context, 3)
    self.numerator = [0 for _ in range(n_clusters)]
    for i in range(0, n_clusters):
        self.numerator[i] = self.cluster_sizes[i] * euclidian_dist(
            self.centroids[i], self.x_center)
    denominator_sum = spark_context.accumulator(0)

    def f(row, denominator_sum, centroid):
        denominator_sum += np.sqrt(
            np.sum(np.square(np.array(row[:-3]) - centroid[row[-2]])))

    df.rdd.foreach(lambda row: f(row, denominator_sum, self.centroids))
    self.denominator = denominator_sum.value
    ch *= np.sum(self.numerator)
    ch /= self.denominator
    return -ch
def get_mean_and_std(all_harvest_df):
    # https://stackoverflow.com/a/47995478
    df_stats = all_harvest_df.select(
        _mean(col('yield')).alias('mean'),
        _stddev(col('yield')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    return mean, std
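# Illustrative follow-up (not from the original source): the returned mean/std can be
# used to keep only rows whose 'yield' lies within 3 standard deviations. Assumes
# `col` is imported from pyspark.sql.functions and `all_harvest_df` is available.
mean, std = get_mean_and_std(all_harvest_df)
filtered_df = all_harvest_df.filter(((col('yield') - mean) / std).between(-3, 3))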
def normalize_grade(self, df):
    print("Normalizing data ...")
    mean_df = df.groupBy(self.item_col).agg(
        _mean(self.grade_col).alias("mean"))
    # mean_df.show(100)
    df = df.join(mean_df, [self.item_col])
    # df.show(100)
    df = df.withColumn(self.grade_col, col(self.grade_col) - col("mean"))
    # df.show(100)
    return df, mean_df
def get_prcp_day(df_in):
    # Group the values, aggregating selected fields per city and day
    print('BUILDING DATAFRAME WITH VALUES AGGREGATED BY DAY')
    df_out = df_in.groupby('city', 'yr', 'mo', 'da').agg(
        _abs(_max('lat')).alias('latitude'),
        _sum('prcp').alias('prcp_dia'),
        _max('tmax').alias('tmax'),
        _min('tmin').alias('tmin'),
        _mean('temp').alias('med_temp')).orderBy('city', 'yr', 'mo', 'da')
    return df_out
def mean_stdv(df):
    unlist = udf(lambda x: round(float(list(x)[0]), 3), DoubleType())
    for i in ["count"]:
        assembler = VectorAssembler(inputCols=[i], outputCol=i + "_Vect")
        scaler = MinMaxScaler(inputCol=i + "_Vect", outputCol=i + "_Scaled")
        pipeline = Pipeline(stages=[assembler, scaler])
        df = pipeline.fit(df).transform(df) \
            .withColumn(i + "_Scaled", unlist(i + "_Scaled")) \
            .drop(i + "_Vect")
    df_stats = df.select(
        _mean(col('count_Scaled')).alias('mean'),
        _stddev(col('count_Scaled')).alias('std')).collect()
    # mean = df_stats[0]['mean']
    # std = df_stats[0]['std']
    return df_stats
def normalize_grade(self, df):
    # print("Normalizing data ...")
    mean_df = df.groupBy(self.user_col).agg(
        _mean(self.grade_col).alias("mean"))
    mean_df_rename = mean_df.withColumnRenamed(self.user_col, "USER")
    # mean_df.show(100)
    df = df.join(mean_df_rename,
                 df[self.user_col] == mean_df_rename["USER"]).drop("USER")
    # df.show(100)
    df = df.withColumn(self.grade_col, col(self.grade_col) - col("mean"))
    # df.show(100)
    return df, mean_df_rename
async def df_sbx_customer_special_box_purchased(self, sbx, spark):
    data = await sbx.with_model('cart_box') \
        .set_page_size(1000) \
        .and_where_is_not_null('purchase') \
        .and_where_is_equal('variety', os.environ['SPECIAL_BOX']).find()
    sc = spark.sparkContext

    def deleteMeta(d):
        dt = {}
        dt['customer'] = d['customer']
        dt['total_items'] = d['total_items']
        dt['current_percentage'] = d['current_percentage']
        dt['count'] = 1
        return dt

    dit = list(map(deleteMeta, data['results']))
    tmp = sc.parallelize(dit, numSlices=100)
    df = spark.read.option("multiLine", "true").json(tmp)
    df2 = df.groupBy("customer").agg(
        func.avg("total_items").alias('total_items'),
        func.avg("current_percentage").alias('current_percentage'),
        func.sum("count").alias('count'))
    (cumean, custd, comean, costd, tmean, tstd) = df2.select(
        _mean(col('current_percentage')).alias('cumean'),
        _stddev(col('current_percentage')).alias('custd'),
        _mean(col('count')).alias('comean'),
        _stddev(col('count')).alias('costd'),
        _mean(col('total_items')).alias('tmean'),
        _stddev(col('total_items')).alias('tstd'),
    ).first()
    df3 = df2.withColumn(
        "acurrent_percentage",
        (col("current_percentage") - cumean) / custd).withColumn(
            "acount", (col("count") - comean) / costd).withColumn(
                "atotal_items", (col("total_items") - tmean) / tstd)
    vecAssembler = VectorAssembler(
        inputCols=["acurrent_percentage", "acount", "atotal_items"],
        outputCol="features")
    return vecAssembler.transform(df3)
def get_metrics(df_in):
    # Group the values, aggregating selected fields per year and city
    print('BUILDING DATAFRAME WITH AGGREGATED VALUES')
    df_out = df_in.groupby('yr', 'city').agg(
        _max('elvt').alias('ele_max'),
        _mean('temp').alias('med_temp'),
        _mean('tmin').alias('med_temp_min'),
        _mean('tmax').alias('med_temp_max'),
        _sum('prcp').alias('prcp'),
        _mean('hmdy').alias('med_umi'),
        _mean('hmin').alias('med_umi_min'),
        _mean('hmax').alias('med_umi_max'),
        _mean('wdsp').alias('med_velo_vento'),
        _mean('gust').alias('med_velo_rajadas_vento')).orderBy('yr', 'city')
    return df_out
def stream_to_control_chart(time, rdd, streaming_dict, id):
    if rdd.isEmpty():
        return
    df = rdd.map(lambda x: Row(**x)).toDF()
    columns = df.schema.names
    conditions_mean = [_mean(col(column)).alias(column) for column in columns]
    df = df.select(conditions_mean).toPandas()
    df["time_stamp"] = time.timestamp() * 1000
    if id in streaming_dict:
        streaming_dict[id] = streaming_dict[id].append(df, ignore_index=True)
    else:
        streaming_dict[id] = df
def incomeZScore():
    df_stats = dataset.select(
        _mean(col('Average_Income')).alias('mean'),
        _stddev(col('Average_Income')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = dataset.select(
        ((dataset['Average_Income'] - mean) / std).alias('z_score_AvgInc'))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = dataset.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
def crimeZScore():
    df2 = unemploymentZScore()
    df_stats = dataset.select(
        _mean(col('Crime_Percent')).alias('mean'),
        _stddev(col('Crime_Percent')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = dataset.select(
        ((dataset['Crime_Percent'] - mean) / std).alias('z_score_Crime'))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
def unemploymentZScore():
    df2 = incomeZScore()
    df_stats = dataset.select(
        _mean(col('Unemployment_Percent')).alias('mean'),
        _stddev(col('Unemployment_Percent')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = dataset.select(
        ((dataset['Unemployment_Percent'] - mean) / std).alias('z_score_Unem'))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
def homelessZScore():
    df2 = crimeZScore()
    df_stats = dataset.select(
        _mean(col('Homeless_Percent')).alias('mean'),
        _stddev(col('Homeless_Percent')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = dataset.select(
        ((dataset['Homeless_Percent'] - mean) / std).alias('z_score_Homeless'))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
def AverageAndStd(time, rdd, streaming_dict, id):
    if rdd.isEmpty():
        return
    df = rdd.map(lambda x: Row(**x)).toDF()
    columns = df.schema.names
    conditions_mean = [
        _mean(col(column)).alias(column + "_mean") for column in columns
    ]
    conditions_std = [
        _stddev(col(column)).alias(column + "_stddev") for column in columns
    ]
    df = df.select(conditions_mean + conditions_std).toPandas()
    df["time_stamp"] = time.timestamp() * 1000
    if id in streaming_dict:
        streaming_dict[id] = streaming_dict[id].append(df, ignore_index=True)
    else:
        streaming_dict[id] = df
def create_parquet_files(spark_session):
    """
    Create a histogram file for each feature.

    Args:
        spark_session: SparkSession used to read and write the parquet data

    Returns:
        list of results (currently left empty)
    """
    all_features = spark_session.read.parquet(
        '/user/***REMOVED***/StackOverflow/output_stackoverflow.parquet')
    all_features = all_features.filter(all_features['is_question'])
    all_results = []
    for feature in INTEGER_FEATURES + FLOAT_FEATURES + BOOLEAN_FEATURES:
        # Replace all values with a z-score above 3 with -1, unless the feature is boolean
        if feature not in BOOLEAN_FEATURES:
            stats = all_features \
                .select(_mean(col(feature)).alias('mean'),
                        stddev(col(feature)).alias('std')) \
                .collect()
            mean = stats[0]['mean']
            std = stats[0]['std']
            all_features = all_features.withColumn(
                feature,
                when(abs((col(feature) - mean) / std) < 3, col(feature)).otherwise(-1))
        if feature in FLOAT_FEATURES:
            # Bucketize each float feature into rounded-number buckets
            all_features = all_features.withColumn(feature, round(col(feature), 2))
        for resolved in [True, False]:
            new_file = all_features.filter(col('has_answer') == resolved) \
                .select(feature) \
                .groupBy(feature).count()
            filename = feature + '_1' if resolved else feature + '_0'
            new_file.write.mode('overwrite') \
                .parquet('/user/***REMOVED***/StackOverflow/swashbuckler/output_'
                         + filename + '.parquet')
    return all_results
def prepareData():
    df_newOpiFac = newOpioidFactor()
    df_newOpiFac.show()
    df_AvgInc = homelessZScore()
    df_stats = df_newOpiFac.select(
        _mean(col('new_opioid_factor')).alias('mean'),
        _stddev(col('new_opioid_factor')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    df1 = df_newOpiFac.select(
        ((df_newOpiFac['new_opioid_factor'] - mean) / std).alias('z_score_opioid'))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df_AvgInc.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
# df.printSchema()
df = df.select(' _dewptm', ' _fog', ' _pressurem', ' _rain', ' _tempm',
               ' _thunder', ' _vism', ' _wdird', ' _wspdm', ' _conds')
cols = df.columns
stages = []
# df.groupBy(" _conds") \
#     .count() \
#     .orderBy(col("count").desc()) \
#     .show()
df_stats = df.select(
    _mean(col(' _vism')).alias('mean_vism'),
    _mean(col(' _wdird')).alias('mean_wdird'),
    _mean(col(' _wspdm')).alias('mean_wspdm'),
).collect()
mean_vism = df_stats[0]['mean_vism']
mean_wdird = df_stats[0]['mean_wdird']
mean_wspdm = df_stats[0]['mean_wspdm']
df = df.fillna({' _vism': mean_vism})
df = df.fillna({' _wdird': mean_wdird})
df = df.fillna({' _wspdm': mean_wspdm})
# print(mean_vism, mean_wdird, mean_wspdm)
# numeric_features = [t[0] for t in df.dtypes if t[1] == 'int' or t[1] == 'float' or t[1] == 'double']
# columns to drop
cols_to_drop = {
    'CustomerID', 'ThreewayCalls', 'CurrentEquipementDays', 'HandsetRefurbished',
    'TruckOwner', 'RVOwner', 'Homeownership', 'BuysViaMailOrder',
    'NotNewCellphoneUser', 'OwnsMotorcycle'
}

# Drop columns
churn = drop_columns(churn, cols_to_drop)

# Deal with missing values
churn = churn.filter(churn.ServiceArea.isNotNull())

# HandsetPrice
handset_mean = churn.select(_mean("HandsetPrice").alias("mean")).first()[0]
churn = churn.withColumn(
    "HandsetPrice",
    when(churn["HandsetPrice"] == "Unknown",
         handset_mean).otherwise(churn["HandsetPrice"]))

# Get rid of nulls
null_dict = find_null_counts(churn)
null_set = set(null_dict.keys())
churn = remove_nulls(churn, null_set)

# Columns to cast to different type
string_columns = {
    "Churn", "ServiceArea", "ChildrenInHH", "HandsetWebCapable",
    "RespondsToMailOffers", "OptOutMailings", "NonUSTravel", "OwnsComputer",
    "HasCreditCard", "NewCellphoneUser", "MadeCallToRetentionTeam",
def main(
    output_folder="./www/stepchain",
    start_date=None,
    end_date=None,
    last_n_days=15,
):
    """Get step data in wmarchive.

    Each step array contains multiple steps. The udf function returns each step as a
    separate row in a list, and flatMap flattens those lists so each step becomes an
    individual row in the dataframe.
    """
    # Borrowed logic from condor_cpu_efficiency
    _yesterday = datetime.combine(date.today() - timedelta(days=1),
                                  datetime.min.time())
    if not (start_date or end_date):
        end_date = _yesterday
        start_date = end_date - timedelta(days=last_n_days)
    elif not start_date:
        start_date = end_date - timedelta(days=last_n_days)
    elif not end_date:
        end_date = min(start_date + timedelta(days=last_n_days), _yesterday)
    if start_date > end_date:
        raise ValueError(
            f"start date ({start_date}) should be earlier than end date ({end_date})"
        )
    spark = get_spark_session()
    df_raw = spark.read.option("basePath", _DEFAULT_HDFS_FOLDER).json(
        get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
    ) \
        .select(["data.*", "metadata.timestamp"]) \
        .filter(
            f"""data.meta_data.jobstate='success'
                AND data.meta_data.jobtype='Production'
                AND data.wmats >= {start_date.timestamp()}
                AND data.wmats < {end_date.timestamp()}
                """
        )
    df_rdd = df_raw.rdd.flatMap(lambda r: udf_step_extract(r))
    df = spark.createDataFrame(df_rdd, schema=get_schema()).dropDuplicates().where(
        _col("ncores").isNotNull()).cache()
    df_details = df.groupby(["task", "site", "step_name"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) / _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
        _collect_set("acquisitionEra").alias("acquisitionEra"),
    ).withColumn("avg_cpueff", _col("avg_cpueff").cast(IntegerType())).toPandas()
    df_task = df.groupby(["task"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) / _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
    ).withColumn("avg_cpueff", _col("avg_cpueff").cast(IntegerType())).toPandas()
    write_htmls(df_details, df_task, start_date, end_date, output_folder)
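# Illustrative sketch (not from the original source) of the flatMap pattern the
# docstring above describes: an extractor yields one dict per step, and flatMap turns
# the per-record lists into individual dataframe rows. The real udf_step_extract is
# defined elsewhere; toy_step_extract and the sample record below are hypothetical,
# and an existing SparkSession named `spark` is assumed.
from pyspark.sql import Row


def toy_step_extract(record):
    """Yield one flat dict per step found in a raw wmarchive-like record."""
    for step in record.get("steps", []):
        yield {"task": record.get("task"),
               "step_name": step.get("name"),
               "ncores": step.get("ncores")}


records = [{"task": "T1", "steps": [{"name": "cmsRun1", "ncores": 4},
                                    {"name": "stageOut1", "ncores": 1}]}]
rdd = spark.sparkContext.parallelize(records)
steps_df = spark.createDataFrame(rdd.flatMap(toy_step_extract).map(lambda d: Row(**d)))
steps_df.show()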
# 3. DATA PREPROCESSING
# Drop Name, PassengerID and Ticket (identifiers, not attributes)
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived', 'Cabin')

# Dealing with missing values
# CABIN: drop Cabin; 77.1% of its values are null.
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived')

# AGE: substitute by the mean
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

df_stats = df.select(
    _mean(col('Age')).alias('mean'),
    _stddev(col('Age')).alias('std')).collect()
mean = df_stats[0]['mean']
std = df_stats[0]['std']
# mean = 29.7, rounded to 30
df = df.fillna(30, subset=['Age'])

# FARE: substitute by the mean
df_stats = df.select(_mean(col('Fare')).alias('mean')).collect()
mean = df_stats[0]['mean']
# mean = 33.2955
df = df.fillna(33.2955, subset=['Fare'])

# EMBARKED: substitute by the most common class
df.groupBy(df['Embarked']).count().show()
# The most common class is 'S'
# Standardization: transform a feature so that the transformed array of values
# has mean = 0 and standard deviation = 1
def standardize(x, meanVal, stdVal):
    return (x - meanVal) / stdVal


def standardize_udf(meanVal, stdVal):
    return udf(lambda x: standardize(x, meanVal, stdVal), FloatType())


# Extract the mean and standard deviation of the column 'height_percentage'
df_stats_hp = df.select(
    _mean(col('height_percentage')).alias('mean'),
    _stddev(col('height_percentage')).alias('std')).collect()
mean_hp = df_stats_hp[0]['mean']
std_hp = df_stats_hp[0]['std']

# Extract the mean and standard deviation of the column 'age'
df_stats_age = df.select(
    _mean(col('age')).alias('mean'),
    _stddev(col('age')).alias('std')).collect()
mean_age = df_stats_age[0]['mean']
std_age = df_stats_age[0]['std']

# Perform simple standardization on the 'age' and 'height_percentage' columns
df = df.withColumn('height_percentage',
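                   # NOTE: the original excerpt is truncated at this point; the
                   # arguments below are a plausible continuation based on the
                   # standardize_udf helper defined above, not the original code.
                   standardize_udf(mean_hp, std_hp)(col('height_percentage')))
df = df.withColumn('age', standardize_udf(mean_age, std_age)(col('age')))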
    'INDUS_outliers', 'INDUS').show()
bounded_df.filter(bounded_df.CHAS_outliers != 0).select(
    'CHAS_outliers', 'CHAS').show()
bounded_df.filter(bounded_df.NOX_outliers != 0).select('NOX_outliers',
                                                       'NOX').show()
bounded_df.filter(bounded_df.RM_outliers != 0).select('RM_outliers',
                                                      'RM').show()
'''
Spark does not support data visualization natively.
Here the Databricks display method is used to visualize the content.
'''
display(
    bounded_df.select('CRIM_outliers', 'ZN_outliers', 'INDUS_outliers',
                      'CHAS_outliers', 'NOX_outliers', 'RM_outliers',
                      'AGE_outliers', 'DIS_outliers', 'RAD_outliers',
                      'TAX_outliers', 'PTRATIO_outliers', 'B_outliers',
                      'LSTAT_outliers', 'PRICE_outliers'))

# Calculating the z-score
val = df.select(df.INDUS.cast("int"))
df_stats = df.select(
    _mean(df.INDUS.cast("double")).alias('mean'),
    _stddev(df.INDUS.cast("double")).alias('std'),
).collect()

# Add z score_INDUS to the dataframe
mean = df_stats[0]['mean']
std = df_stats[0]['std']
score_INDUS = df.withColumn("z score_INDUS",
                            (df.INDUS.cast("double") - mean) / std)
score_INDUS.show()
          str(processCount) + ' - ' + filename)
# Read file to dataset and apply all regex functions
found_type = []
fileinfo = []
regex_res = []
df = sqlContext.read.format("csv").option(
    "header", "false").option("inferSchema", "true").option(
        "delimiter", "\t").schema(customSchema).load(inputDirectory + filename)
df_stats = mean_stdv(df)
mean = df_stats[0]['mean']
std = df_stats[0]['std']
count_all = count_all_values(df)
# Added col_length, which is the average length of the column
df_length = df.select(_mean(length(col("val"))).alias('avg_length'))
col_length = df_length.collect()[0][0]
percentage_website, found_type, type_count_web = re_find_website(
    df, count_all, found_type)
percentage_zip, found_type, type_count_zip = re_find_zipCode(
    df, count_all, found_type)
percentage_buildingCode, found_type, type_count_building = re_find_buildingCode(
    df, count_all, found_type)
percentage_phoneNum, found_type, type_count_phone = re_find_phoneNum(
    df, count_all, found_type)
percentage_lat_lon, found_type, type_count_lat_lon = re_find_lat_lon(
    df, count_all, found_type)
percentage_add_st, found_type, type_count_add_st = re_find_street_address(
    df, count_all, col_length, found_type)
percentage_school_name, found_type, type_count_school_name = re_find_school(
# print schema
print(init_flat_data.printSchema(), '\n')

# calculate the min and max of order date in order to calculate recency
max_order_date, min_order_date = init_flat_data \
    .select(
        _max(col('order_date')),
        _min(col('order_date'))) \
    .take(1)[0]

# calculate recency/frequency and monetary
calculate_diff_day = udf(lambda x: (max_order_date - x).days, IntegerType())
rfm_table = init_flat_data \
    .withColumn('recency', calculate_diff_day('order_date')) \
    .groupby(['company_id', 'company_name', 'country']) \
    .agg(
        _mean(col('recency')).alias('recency'),
        _count(col('order_id')).alias('frequency'),
        _sum(col('NBI')).alias('monetary')
    )

# calculate quantiles for each variable
quantiles = rfm_table.approxQuantile(['recency', 'frequency', 'monetary'],
                                     [0.20, 0.4, 0.6, 0.8], 0)
r_quantile = quantiles[0]
f_quantile = quantiles[1]
m_quantile = quantiles[2]

# calculate score of each variable
def_r_score = udf(
    lambda x: 5 if x < r_quantile[0] else 4 if x < r_quantile[1] else 3
    if x < r_quantile[2] else 2 if x < r_quantile[3] else 1, IntegerType())
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as _sum, udf, col, mean as _mean
from pyspark.sql.types import StringType
import time

# Avoid creating a new Spark session every time the code is executed
spark = SparkSession.builder.appName('luigi').getOrCreate()

# Read the transactions CSV, keeping only 'block_timestamp' and 'gas',
# ordered by timestamp
dfG = spark.read.csv('/data/ethereum/transactions',
                     header=True).select('block_timestamp',
                                         'gas').orderBy("block_timestamp")

# UDF that converts a unix timestamp into a "YYYY MM" string
udfG = udf(lambda x: time.strftime("%Y %m", time.gmtime(x)), StringType())

# Add the 'time' column and compute the average gas per month
dfG = dfG.withColumn(
    'time',
    udfG(col('block_timestamp').cast("integer"))).groupBy("time").agg(
        _mean("gas")).orderBy("time")
# dfG.show()
dfG.repartition(1).write.csv("outCSpark", sep=",", header=True)
def get_et0(df_in):
    # Group the values, aggregating selected fields per city and month
    print('BUILDING DATAFRAME WITH AGGREGATED VALUES')
    df_out = df_in.groupby('city', 'yr', 'mo').agg(
        _mean('ET0').alias('ET0_MES')).orderBy('city', 'yr', 'mo')
    return df_out
    (dmin is not null AND trim(dmin) != "") AND
    (hmdy is not null AND trim(hmdy) != "") AND
    (hmax is not null AND trim(hmax) != "") AND
    (hmin is not null AND trim(hmin) != "") AND
    (wdsp is not null AND trim(wdsp) != "") AND
    (wdct is not null AND trim(wdct) != "") AND
    (gust is not null AND trim(gust) != "")
''')
print(df_clima_new.count())
print(df_clima.count())

# Remove NULL or '' values from the value fields
df_clima_agg = df_clima.groupby('city', 'yr').agg(
    _mean('prcp').alias('prcp_mean'),
    _mean('stp').alias('stp_mean'),
    _mean('smax').alias('smax_mean'),
    _mean('smin').alias('smin_mean'),
    _mean('gbrd').alias('gbrd_mean'),
    _mean('temp').alias('temp_mean'),
    _mean('dewp').alias('dewp_mean'),
    _mean('tmax').alias('tmax_mean'),
    _mean('dmax').alias('dmax_mean'),
    _mean('tmin').alias('tmin_mean'),
    _mean('dmin').alias('dmin_mean'),
    _mean('hmdy').alias('hmdy_mean'),
    _mean('hmax').alias('hmax_mean'),
    _mean('hmin').alias('hmin_mean'),
    _mean('wdsp').alias('wdsp_mean'),
    _mean('wdct').alias('wdct_mean'),