def extractData(days):
    """Load, join and filter the per-day data and return the combined result.

    For every day the 1s and 5s data sets are loaded and inner-joined on
    ``vin`` and ``normaltime`` (which is then parsed with
    ``normaltimeFormat``). Within that day's data a row is kept when the gap
    to the previous or to the next sample of the same ``vin`` is at least
    60 seconds and both ``tel_latitudedeg`` and ``tel_longitudedeg`` are
    positive. A row without a previous/next sample gets a gap of 1000, so
    the first and last sample of each ``vin`` always pass the gap test.

    Args:
        days: iterable of day identifiers handed to the per-day loaders.

    Returns:
        The union of all filtered per-day DataFrames, or ``None`` when no
        day yielded both data sets. (Previously the result was built but
        never returned.)
    """
    by_vin_asc = Window.partitionBy("vin").orderBy("normaltime")
    by_vin_desc = Window.partitionBy("vin").orderBy(desc("normaltime"))
    dfSchema = build_schema("conv")

    df = None
    for day in days:
        df_1s = load1sDataPerDay(day, dfSchema)
        df_5s = load5sDataPerDay(day)
        if df_1s is not None and df_5s is not None:
            df_tmp = df_1s.join(df_5s, ["vin", "normaltime"], "inner").withColumn(
                "normaltime",
                to_timestamp(col("normaltime"), normaltimeFormat))
            # lag() over the descending window yields the *next* sample.
            df_tmp = df_tmp.withColumn(
                "next_normaltime", F.lag(df_tmp.normaltime).over(by_vin_desc))
            df_tmp = df_tmp.withColumn(
                "prev_normaltime", F.lag(df_tmp.normaltime).over(by_vin_asc))

            # Gaps in seconds; null means there is no neighbouring sample.
            prev_gap = (df_tmp.normaltime.cast("long")
                        - df_tmp.prev_normaltime.cast("long"))
            df_tmp = df_tmp.withColumn(
                "prev_diff",
                F.when(F.isnull(prev_gap), 1000).otherwise(prev_gap))
            next_gap = (df_tmp.next_normaltime.cast("long")
                        - df_tmp.normaltime.cast("long"))
            df_tmp = df_tmp.withColumn(
                "next_diff",
                F.when(F.isnull(next_gap), 1000).otherwise(next_gap))

            df_tmp = df_tmp.where(
                "(prev_diff >= 60 or next_diff >= 60) and tel_latitudedeg > 0 and tel_longitudedeg > 0")
            print('{} starting/ending rows ..'.format(df_tmp.count()))
            df = df_tmp if df is None else df.union(df_tmp)
        print('{} processing ..'.format(day))

    if df is not None:
        print('{} rows loaded ..'.format(df.count()))
    return df
def replaceNull(self, value, subset=None):
    """Replace nulls in ``self.inputDf`` with ``value`` and return the result.

    * If ``value`` is a string in ``YYYY-MM-DD`` form (e.g. "2020-10-01"),
      only nullable date columns of ``self.inputSchema`` are filled.
    * If ``value`` is a string in ``YYYY-MM-DDThh:mm:ss`` form
      (e.g. "2020-10-01T19:50:06"), only nullable timestamp columns are
      filled.
    * Any other value is delegated to ``DataFrame.fillna(value, subset)``.

    Args:
        value: replacement value.
        subset: optional collection of column names to restrict the fill to.

    Returns:
        The updated ``self.inputDf`` (the attribute is reassigned as well).
    """

    def _parses_as(fmt):
        # Only strings can be date/timestamp literals.
        if not isinstance(value, str):
            return False
        try:
            datetime.datetime.strptime(value, fmt)
        except ValueError:
            return False
        return True

    if _parses_as("%Y-%m-%d"):
        target_type = "date"
    elif _parses_as("%Y-%m-%dT%H:%M:%S"):
        target_type = "timestamp"
    else:
        self.inputDf = self.inputDf.fillna(value, subset)
        return self.inputDf

    for field in self.inputSchema:
        # typeName() is stable across PySpark versions, whereas
        # str(dataType) changed from "DateType" to "DateType()".
        if field.dataType.typeName() != target_type or not field.nullable:
            continue
        if subset is not None and field.name not in subset:
            continue
        # Cast the literal to the column's own type; a bare string literal
        # would promote the whole column to string.
        replacement = lit(value).cast(field.dataType)
        self.inputDf = self.inputDf.withColumn(
            field.name,
            when(isnull(col(field.name)), replacement).otherwise(
                col(field.name)))
    return self.inputDf
def preprocess(df):
    """Clean the raw purchases and count purchases per customer/product.

    ``customer_id`` and ``global_product_id`` are cast to integer, rows in
    which either of them is null are dropped, and the remaining rows are
    grouped to a purchase count. Row counts and summary statistics are
    logged along the way.

    Args:
        df (DataFrame): raw data from CSV file

    Returns:
        DataFrame with columns customer_id, global_product_id, count
    """
    logger.info('Raw data contains {:,} rows'.format(df.count()))

    customer_id = col('customer_id').cast('integer')
    product_id = col('global_product_id').cast('integer')
    both_ids_present = ~isnull('customer_id') & ~isnull('global_product_id')
    cleaned = df.select(customer_id, product_id).filter(both_ids_present)
    # Cached because it is counted and then aggregated.
    cleaned.cache()
    logger.info('Cleaned data contains {:,} rows'.format(cleaned.count()))

    purchases = cleaned.groupBy('customer_id', 'global_product_id').count()
    n_rows = purchases.count()
    logger.info('Customer product purchases contains {:,} rows'.format(n_rows))
    n_customers = purchases.select('customer_id').distinct().count()
    logger.info(
        'Customer product purchases contains {:,} customers'.format(n_customers))
    n_products = purchases.select('global_product_id').distinct().count()
    logger.info(
        'Customer product purchases contains {:,} products'.format(n_products))

    logger.info('Customer total product purchases summary statistics:')
    purchases.select('count').describe().show()
    return purchases
def add_has_user_page(wmhist, page_history, remember_dict):
    """Left-join user-page history onto ``wmhist`` and flag matching events.

    User pages are the ``page_history`` rows in namespace 2 that are neither
    redirects nor deleted. An event matches a user page when the wiki and
    the user name / page title agree, the event is later than the page's
    first edit, and the event falls inside the page's
    ``[start_timestamp, end_timestamp)`` interval (an open end is allowed).

    Args:
        wmhist: events DataFrame.
        page_history: page history DataFrame.
        remember_dict: passed through unchanged.

    Returns:
        Tuple ``(wmhist, remember_dict)`` where ``wmhist`` carries the
        joined ``user_page_*`` columns plus a boolean ``has_user_page``.
    """
    # Filter before projecting: the redirect/deleted flags are not part of
    # the projected column set, so filtering afterwards would rely on the
    # analyzer resolving columns that the select has already dropped.
    live_user_pages = page_history.filter(
        (f.col("page_namespace_historical") == 2)
        & (f.col("page_is_redirect") == False)  # noqa: E712 (Column comparison)
        & (f.col("page_is_deleted") == False))  # noqa: E712
    user_pages = live_user_pages.select([
        f.col("wiki_db").alias("up_wiki_db"),
        f.col("page_id").alias("user_page_id"),
        f.col("page_title_historical").alias("user_page_title"),
        f.col("page_first_edit_timestamp").alias("user_page_first_edit"),
        f.col("start_timestamp").alias("user_page_start_timestamp"),
        f.col("end_timestamp").alias("user_page_end_timestamp"),
    ])

    join_cond = [
        wmhist.wiki_db == user_pages.up_wiki_db,
        wmhist.event_user_text_historical == user_pages.user_page_title,
        wmhist.event_timestamp > user_pages.user_page_first_edit,
        wmhist.event_timestamp >= user_pages.user_page_start_timestamp,
        # A null end timestamp means the page state is still current.
        ((wmhist.event_timestamp < user_pages.user_page_end_timestamp)
         | f.isnull(f.col("user_page_end_timestamp"))),
    ]
    wmhist = wmhist.join(user_pages, on=join_cond, how="left_outer")
    wmhist = wmhist.withColumn("has_user_page",
                               wmhist.user_page_id.isNotNull())
    return (wmhist, remember_dict)
def cleanup_no_activity_rows(df, activity_field='activity_dt'):
    """Drop "empty" activity rows for clients that do have real activity.

    Each joined DataFrame can produce rows with a null activity value for
    clients that had no activity in that particular source. For every
    (client_id, branch) combination that has at least one row with a
    non-null ``activity_field``, the rows with a null ``activity_field``
    are removed. Combinations without any activity keep their rows.

    Args:
        df: DataFrame with ``client_id``, ``branch`` and the activity column.
        activity_field: name of the activity column (default 'activity_dt').

    Returns:
        The filtered DataFrame with the original columns.
    """
    df_has_activity = df.filter(F.col(activity_field).isNotNull()) \
        .select([
            F.col('client_id').alias('client_id_temp'),
            F.col('branch').alias('branch_temp'),
        ]).distinct()

    # Only null-activity rows can match; the join previously tested the
    # hard-coded 'activity_dt' column instead of ``activity_field``.
    is_redundant_empty_row = (
        F.isnull(F.col(activity_field))
        & (F.col('client_id') == F.col('client_id_temp'))
        & (F.col('branch') == F.col('branch_temp')))
    df = df.join(df_has_activity, is_redundant_empty_row, how='left')

    # Unmatched rows (null marker) are the ones to keep.
    df = df.filter(F.isnull(F.col('client_id_temp')))
    return df.drop('client_id_temp').drop('branch_temp')
def _get_telemetry_sanity_check_metrics(self, enrollments, df):
    """Build per-client aggregations that flag suspicious telemetry.

    Returns an empty list when ``df`` has no ``experiments`` column of type
    ``map<string,string>``; otherwise two aggregate columns,
    ``has_contradictory_branch`` and ``has_non_enrolled_data``.
    """
    # TODO: Once we know what form the metrics library will take,
    # we should move the below metric definitions and documentation
    # into it.
    column_types = dict(df.dtypes)
    if column_types.get('experiments') != 'map<string,string>':
        # Not every table carries an experiments map; nothing to check.
        return []

    tagged_branch = df.experiments[self.experiment_slug]

    # 1 when the row is tagged with a branch other than the enrolled one.
    # Such clients (e.g. cloned profiles) should be rare and evenly spread
    # across branches.
    contradicts_enrollment = (
        tagged_branch != enrollments.branch).astype('int')

    # 1 when the row has an experiments map but no entry for this
    # experiment: a client_id clash or an unenrolled client. Should also be
    # rare and similar between branches.
    missing_experiment_tag = (
        ~F.isnull(df.experiments) & F.isnull(tagged_branch)).astype('int')

    return [
        F.max(F.coalesce(contradicts_enrollment, F.lit(0))).alias(
            'has_contradictory_branch'),
        F.max(F.coalesce(missing_experiment_tag, F.lit(0))).alias(
            'has_non_enrolled_data'),
    ]
def handle_missing_get_indicator_column(df, input_column, expected_type):
    """Build a boolean column expression that marks missing values.

    The input column is cast to ``expected_type`` first. For a
    ``StringType`` instance a value is missing when it is null or blank
    after trimming; for any other type when it is null or NaN.
    """
    casted = df[input_column].cast(expected_type)
    is_null = sf.isnull(casted)
    if isinstance(expected_type, StringType):
        return is_null | (sf.trim(casted) == "")
    return is_null | sf.isnan(casted)
def task_1(data_io, review_data, product_data):
    """Summarise per-product review ratings and save the summary.

    Reviews are aggregated per ``asin`` into a mean rating and a rating
    count, left-joined onto the products, and the joined table is reduced
    to a single row of statistics which is stored via ``data_io.save`` under
    the name 'task_1' and returned.
    """
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    overall_column = 'overall'
    # Outputs:
    mean_rating_column = 'meanRating'
    count_rating_column = 'countRating'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    per_product = review_data.groupBy(F.col(asin_column)).agg(
        F.avg(F.col(overall_column)).alias(mean_rating_column),
        F.count("*").alias(count_rating_column))
    # Left join: products without any review keep null rating columns.
    joined = product_data.join(per_product, on=asin_column, how='left')

    summary_exprs = [F.count("*")]
    for rating_column in (mean_rating_column, count_rating_column):
        summary_exprs.extend([
            F.avg(F.col(rating_column)),
            F.variance(F.col(rating_column)),
            F.sum(F.isnull(F.col(rating_column)).astype("int")),
        ])
    summary = joined.agg(*summary_exprs).collect()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    # Values are calculated programmatically; the keys must not change.
    result_keys = [
        'count_total',
        'mean_meanRating',
        'variance_meanRating',
        'numNulls_meanRating',
        'mean_countRating',
        'variance_countRating',
        'numNulls_countRating',
    ]
    res = {key: summary[position] for position, key in enumerate(result_keys)}
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_1')
    return res
def check(decimal_columns):
    """Return a condition that is true for values failing a decimal cast.

    Args:
        decimal_columns: either a column name, or a tuple
            ``(name, precision, scale)``. Precision and scale default to
            38 and 10 when only a name is given.

    Returns:
        Column expression: the value is not null, but becomes null when
        cast to ``DecimalType(precision, scale)``.
    """
    # isinstance also accepts tuple subclasses such as namedtuples.
    if isinstance(decimal_columns, tuple):
        name = decimal_columns[0]
        precision = decimal_columns[1]
        scale = decimal_columns[2]
    else:
        name, precision, scale = decimal_columns, 38, 10
    column = col(name)
    return ~isnull(column) & isnull(column.cast(DecimalType(precision, scale)))
def preprocess(self, df: DataFrame):
    """Filter, normalise and de-duplicate the raw event rows.

    Steps, in order: drop rows without ``se_property``; lower-case
    ``se_label`` and drop rows where it is null; derive
    ``se_property_type`` via ``self.classify``; cast ``isSuspect`` to int;
    keep rows that have an ``author_id`` or a ``discovery_id``; drop
    duplicates on the event/user/device/ip/suspect key; add the ``hour`` of
    ``collector_tstamp``.
    """
    labelled = (
        df.filter(~F.isnull("se_property"))
        .withColumn("se_label", F.lower(F.col("se_label")))
        .filter(~F.isnull("se_label")))

    typed = labelled.withColumn(
        "se_property_type", self.classify("event_sub_type", "se_label"))
    typed = typed.withColumn("isSuspect", F.col("isSuspect").cast("int"))

    has_target = (F.col("author_id").isNotNull()
                  | F.col("discovery_id").isNotNull())
    dedup_keys = ["event_id", "user_token", "device_id", "user_ipaddress",
                  "isSuspect"]
    unique_events = typed.filter(has_target).drop_duplicates(dedup_keys)

    preprocess_df = unique_events.withColumn("hour", F.hour("collector_tstamp"))
    return preprocess_df
def union_label_feature(labelDF, itemprofileDF):
    """Attach item profile features for ``item2`` and ``item1`` to the labels.

    The profile's ``value`` column is joined in twice (matching ``pid``
    against ``item2``, then against ``item1``) and renamed to ``features2``
    and ``features1``. Rows for which either feature is null are dropped.
    """
    with_item2 = labelDF.join(
        itemprofileDF, labelDF.item2 == itemprofileDF.pid, "left_outer")
    with_item2 = with_item2.withColumnRenamed("value", "features2").drop("pid")
    with_item2 = with_item2.where(~F.isnull('features2'))

    with_both = with_item2.join(
        itemprofileDF, with_item2.item1 == itemprofileDF.pid, "left_outer")
    with_both = with_both.withColumnRenamed('value', 'features1').drop("pid")
    return with_both.where(~F.isnull('features1'))
def add_duration_id(spark, df, logger):
    """Add ``visitduration_id`` by bucketing the visit duration in days.

    The duration is ``datediff(depdate, arrdate)``. It is bucketed with the
    split points below; rows with a null ``arrdate`` or ``depdate`` get the
    id 999. The helper columns ``duration_days`` and ``ddbuckets`` remain in
    the result. ``spark`` is not used.
    """
    bucket_edges = [
        float('-Inf'), 0, 4, 8, 11, 15, 22, 29, float('Inf')
    ]
    with_days = df.withColumn("duration_days", datediff("depdate", "arrdate"))

    bucketizer = Bucketizer(splits=bucket_edges,
                            inputCol="duration_days",
                            outputCol="ddbuckets")
    bucketed = bucketizer.setHandleInvalid("keep").transform(with_days)

    date_missing = isnull(col("arrdate")) | isnull(col("depdate"))
    duration_id = when(date_missing, 999).otherwise(
        col("ddbuckets").cast(IntegerType()))
    result = bucketed.withColumn("visitduration_id", duration_id)

    logger.info("Added duration_id")
    return result
def test_null(frame, to_check):
    """Report which of the given columns contain null values.

    Args:
        frame: object exposing ``toDF()`` that yields a Spark DataFrame.
        to_check: iterable of column names to inspect.

    Returns:
        Tuple ``(has_nulls, message)``: ``has_nulls`` is True when at least
        one column contains a null; ``message`` lists the offending columns
        followed by " contain null values".
    """
    frame = frame.toDF()
    columns = list(to_check)

    null_counts = ()
    if columns:
        # One aggregation for all columns instead of one job per column.
        # when() emits a non-null literal for null cells, which count() tallies.
        null_counts = frame.select(
            [count(when(isnull(name), name)) for name in columns]
        ).collect()[0]

    test_result = [
        name for name, nulls in zip(columns, null_counts) if nulls > 0
    ]
    # Leading space added: the message used to read "a, bcontain null values".
    return len(test_result) > 0, ", ".join(test_result) + " contain null values"
def get_nulls(df):
    """
    https://github.com/paaarx

    Count the null values of every column and return the non-zero counts.

    Parameters:
        df (DataFrame): The DataFrame to be analyzed.

    Returns:
        DataFrame: single-row DataFrame holding the null count of each
        column that has at least one null value.
    """
    null_counts = df.select([
        count(when(isnull(name), name)).alias(name) for name in df.columns
    ])
    counts_by_column = null_counts.collect()[0].asDict()
    columns_with_nulls = [
        name for name, found in counts_by_column.items() if found > 0
    ]
    return null_counts.select(columns_with_nulls)
def transformCsvToDbDataFile(csvFile):
    """
    Shape the listings into the format stored in the Postgres database.

    Rules applied:
    1. Keep listings with minimum nights >= 5 and maximum nights <= 30
    2. Keep listings whose amenities mention Internet, Wifi and TV
    3. Keep listings whose price contains a $ sign, then strip the $ sign
       from price and weekly_price
    4. Convert id, minimum nights and maximum nights to Integer
    5. Convert price and weekly_price to Double
    6. When weekly_price is null, set it to 0
    """
    listings = csvFile.select("id", "listing_url", "amenities",
                              "minimum_nights", "maximum_nights", "price",
                              "weekly_price", "city", "country")

    for amenity in ("Internet", "Wifi", "TV"):
        listings = listings.filter(csvFile["amenities"].contains(amenity))
    listings = listings.filter(csvFile["price"].contains("$"))

    for int_column in ("id", "minimum_nights", "maximum_nights"):
        listings = listings.withColumn(
            int_column, csvFile[int_column].cast(IntegerType()))

    price = F.regexp_replace("price", "\\$", "").cast(DoubleType())
    weekly_price = F.when(F.isnull(csvFile["weekly_price"]), 0.0).otherwise(
        F.regexp_replace("weekly_price", "\\$", "").cast(DoubleType()))
    listings = listings.withColumn("price", price)
    listings = listings.withColumn("weekly_price", weekly_price)

    listings = listings.where(csvFile.colRegex("minimum_nights") >= 5)
    return listings.where(csvFile.colRegex("maximum_nights") <= 30)
def _clean_data(self, data, stored_missing_values=None):
    """Drop sparse features and fill the remaining missing values.

    Settings are read from ``self.data_settings['data_handling']``:

    * ``feature_remove_by_null_percentage`` (default 0.5): columns whose
      null fraction is not below this threshold are dropped.
    * ``features_handling``: per-feature dict whose ``fillna`` entry is
      either a literal fill value or the name of a function in
      ``pyspark.sql.functions`` (default 'mean') used to compute it.

    Columns without an entry in ``features_handling`` are filled with their
    mean. The computed fill values are stored through
    ``self.save_metadata('missing_values', ...)``.

    Args:
        data: input DataFrame.
        stored_missing_values: optional fill values that take precedence
            over the freshly computed ones.

    Returns:
        The cleaned DataFrame.
    """
    data_handling = self.data_settings.get('data_handling', {})
    features_handling = data_handling.get('features_handling', {})

    # Remove features by null percentage. The row count is computed once;
    # it used to be re-evaluated (one Spark job) for every column.
    null_percentage = data_handling.get("feature_remove_by_null_percentage", 0.5)
    total_rows = data.count()
    null_percentages = data.select([
        (F.count(F.when(F.isnull(c), c)) / total_rows).alias(c)
        for c in data.columns
    ]).collect()[0]
    data = data.select(
        [c for c in data.columns if null_percentages[c] < null_percentage])

    # Fill values configured per feature: a literal, or an aggregate by name.
    missing_values = {}
    for feature, handling in features_handling.items():
        fill = handling.get('fillna', 'mean')
        if isinstance(fill, str):
            # getattr instead of eval: same lookup, no code execution.
            aggregate = getattr(F, fill)
            missing_values[feature] = data.agg(
                aggregate(feature)).collect()[0][0]
        else:
            missing_values[feature] = fill

    # All other features default to their mean.
    default_missing_features = [
        c for c in data.columns if c not in features_handling
    ]
    if default_missing_features:
        default_missing_values = data.select(
            [F.mean(c).alias(c) for c in default_missing_features]
        ).collect()[0]
        missing_values.update(
            {c: default_missing_values[c] for c in default_missing_features})

    self.save_metadata('missing_values', missing_values)

    if stored_missing_values is not None:
        return data.fillna(stored_missing_values)
    return data.fillna(missing_values)
def remove_positive_samples_from_negative_samples(neg_samples, pos_samples):
    """Drop negative samples that also occur as positive samples.

    Two samples are the same when ``date``, ``hour`` and ``street_id``
    match.

    Args:
        neg_samples: DataFrame of negative samples.
        pos_samples: DataFrame of positive samples.

    Returns:
        ``neg_samples`` without the rows whose key exists in
        ``pos_samples``.
    """
    join_keys = ["date", "hour", "street_id"]
    # An anti join keeps exactly the left rows without a match. It replaces
    # the outer join + temporary "exists" marker, which clashed with any
    # pre-existing "exists" column in neg_samples.
    return neg_samples.join(
        pos_samples.select(*join_keys), join_keys, "left_anti")
def get_most_tornados(station_weather_data):
    """Return the country and value of the highest ``STREAK`` row.

    Only rows with ``FRSHTT == '10011'`` are considered. ``STREAK`` is a
    running row number inside groups derived from the difference of two
    row numberings (per country, and per ``DIFF`` value).

    Returns:
        Tuple ``(country, streak)`` taken from the first and last field of
        the top row.

    Raises:
        IndexError: when no row matches the ``FRSHTT`` filter.
    """
    # NOTE(review): DIFF is the plain difference of consecutive YEARMODA
    # values -- presumably meant as a day gap; confirm YEARMODA's type.
    per_country = Window.partitionBy("COUNTRY_FULL").orderBy(
        ["COUNTRY_FULL", "YEARMODA"])
    per_diff = Window.partitionBy("DIFF").orderBy("COUNTRY_FULL")
    per_group = Window.partitionBy("GRP").orderBy(
        ["COUNTRY_FULL", "YEARMODA"])

    events = station_weather_data.select(
        ['COUNTRY_FULL', "YEARMODA", "FRSHTT"]).where("FRSHTT == '10011'")
    events = events.withColumn("PREV", F.lag(events.YEARMODA).over(per_country))

    gap = events.YEARMODA - events.PREV
    events = events.withColumn("DIFF", F.when(F.isnull(gap), 0).otherwise(gap))

    group_id = F.row_number().over(per_country) - F.row_number().over(per_diff)
    events = events.withColumn("GRP", group_id)
    events = events.withColumn("STREAK", F.row_number().over(per_group))

    top_row = events.orderBy(F.desc("STREAK")).take(1)[0]
    return top_row[0], top_row[-1]
def remove_features_by_null_threshold(self, data, percentage=0.3):
    """
    Remove the columns whose fraction of nulls is not below ``percentage``.

    :param data: the DataFrame
    :param percentage: null fraction threshold - default 0.3 (30%)
    :return: DataFrame restricted to the remaining columns
    """
    # Count the rows once instead of once per column.
    total_rows = data.count()
    null_percentages = data.select([
        (F.count(F.when(F.isnull(c), c)) / total_rows).alias(c)
        for c in data.columns
    ]).collect()[0]

    n_features = len(data.columns)
    data = data.select(
        [c for c in data.columns if null_percentages[c] < percentage])
    new_n_features = len(data.columns)

    if n_features == new_n_features:
        print(
            "Features number was not changed, did not found null features more than %0.2f percentage"
            % percentage)
    else:
        # The DataFrame has no ``.shape``; selecting columns keeps the row
        # count, so the shape is (total_rows, remaining columns).
        print(
            "%d Features has removed, new data shape is (%d,%d)"
            % ((n_features - new_n_features), total_rows, new_n_features))
    return data
def Sparkseeds(dict, i, k, hashDF, sc):
    """Find the seeds of sequence ``dict[i]`` that also occur in ``hashDF``.

    Every length-``k`` window of the sequence is hashed with
    ``HashTable.hash_djb2`` and joined against ``hashDF`` on
    ``ID_SEQ == ID_GEN``. Hits are then filtered by the distance to the
    neighbouring hits in ``POS_SEQ`` order.

    Returns:
        DataFrame with ``NUM_SEQ``, ``ID_SEQ``, ``POS_SEQ`` and ``POS_GEN``,
        ordered by ``POS_SEQ``.
    """
    sequence = dict[i]
    # NOTE(review): the range stops at len - k, so the window starting at
    # index len - k is not generated -- confirm this is intended.
    seeds = [(i, HashTable.hash_djb2(sequence[start:start + k]), start)
             for start in range(0, len(sequence) - k)]
    seed_rows = sc.parallelize(seeds).map(
        lambda x: Row(NUM_SEQ=x[0], ID_SEQ=x[1], POS_SEQ=x[2]))
    seed_df = sqlContext.createDataFrame(seed_rows)

    hits = seed_df.join(hashDF, seed_df.ID_SEQ == hashDF.ID_GEN, how='inner')
    hits = hits.orderBy(hits.POS_SEQ).select(hits.NUM_SEQ, hits.ID_SEQ,
                                             hits.POS_SEQ, hits.POS_GEN)

    by_position = Window.partitionBy(hits.NUM_SEQ).orderBy(hits.POS_SEQ)
    hits = hits.withColumn("prev_value", F.lag(hits.POS_SEQ).over(by_position))
    # Distance to the previous hit; 0 for the first hit of a sequence.
    step = hits.POS_SEQ - hits.prev_value
    hits = hits.withColumn("dist", F.when(F.isnull(step), 0).otherwise(step))
    hits = hits.select(hits.NUM_SEQ, hits.ID_SEQ, hits.POS_SEQ, hits.dist,
                       hits.POS_GEN)
    # dist0: distance recorded on the following hit.
    hits = hits.withColumn("dist0", F.lead(hits.dist).over(by_position))

    # A hit is removed when it is first or >= 50 from its predecessor, and
    # also last or >= 50 from its successor.
    far_from_previous = (hits.dist == 0) | (hits.dist >= 50)
    far_from_next = hits.dist0.isNull() | (hits.dist0 >= 50)
    isolated = hits.filter(far_from_previous & far_from_next)
    hits = hits.subtract(isolated)

    return hits.orderBy(hits.POS_SEQ).select(hits.NUM_SEQ, hits.ID_SEQ,
                                             hits.POS_SEQ, hits.POS_GEN)
def calculate_average_distance(vehicles_evts_df, op_prd_evts_df):
    """
    Add the distance between consecutive vehicle events.

    Every vehicle event is combined with every operating period (join on a
    constant key). ``lng`` and ``lat`` are converted to radians, and
    ``calculate_distance`` is applied to each event and its predecessor
    within the same ``(op_prd_id, vehicle_id)`` ordered by ``at``. No
    averaging is done here.

    :param vehicles_evts_df: vehicle events DF.
    :param op_prd_evts_df: operating periods DF.
    :return: DF with col `distance` (0 where no distance could be computed,
        e.g. for the first event of a group); ``lng``/``lat`` are in radians.
    """
    # Constant key so that both data frames can be joined.
    vehicles_evts_df = vehicles_evts_df.withColumn('key', F.lit(1))
    op_prd_evts_df = op_prd_evts_df.withColumn('key', F.lit(1))
    df_merge = vehicles_evts_df.join(
        op_prd_evts_df, on='key', how='left').drop('key')

    # F.radians replaces F.toRadians, which was removed in Spark 3.0.
    df_merge = df_merge \
        .withColumn('lng', F.radians(F.col('lng'))) \
        .withColumn('lat', F.radians(F.col('lat')))

    w = Window.partitionBy('op_prd_id', 'vehicle_id').orderBy("at")
    df = df_merge.withColumn(
        'distance',
        calculate_distance('lng', 'lat',
                           F.lag('lng', 1).over(w),
                           F.lag('lat', 1).over(w)))
    df = df.withColumn(
        'distance',
        F.when(F.isnull(df['distance']), 0).otherwise(df['distance']))
    # The DataFrame alias is kept from the original for compatibility.
    return df.alias('distance')
def __init__(self):
    """Set up the window, the pre-aggregation columns and the aggregation.

    ``request_interval`` is the number of seconds between a request and the
    previous request of the same ``(client_request_host, client_ip)``; it
    is left null for the first request of each group. The aggregation is
    the variance of that interval expressed in minutes.
    """
    super(FeatureRequestIntervalVariance, self).__init__()

    self.w = Window.partitionBy(
        F.col('client_request_host'), F.col('client_ip')
    ).orderBy(F.col("@timestamp"))

    interval_minutes = F.col('request_interval').cast('float') / 60.
    self.group_by_aggs = {
        'request_interval_var': F.variance(interval_minutes),
    }

    seconds_since_prev = (F.col('@timestamp').cast('long')
                          - F.col('prev_ts').cast('long'))
    interval_or_zero = F.when(
        F.isnull(seconds_since_prev), 0
    ).otherwise(seconds_since_prev)
    self.pre_group_by_calcs = {
        'row_num_per_group': F.row_number().over(self.w),
        'prev_ts': F.lag(F.col('@timestamp')).over(self.w),
        # Only rows after the first one of a group get an interval.
        'request_interval': F.when(
            F.col('row_num_per_group') > 1, interval_or_zero
        ).otherwise(None),
    }
def main(
    salt,
    iterations,
    klen,
    project,
    input_table,
    output_table,
    bucket,
):
    """Hash ``gps_adid`` with PBKDF2-HMAC-SHA1 and write the result table.

    Reads ``{project}.{input_table}`` from BigQuery, drops rows without a
    ``gps_adid``, derives ``identifier`` as the base64 encoded hash, and
    overwrites ``{project}.{output_table}`` with the columns ``identifier``
    and ``installed_at``, using ``bucket`` as the temporary GCS bucket.
    """
    spark = SparkSession.builder.appName("adjust_gps_hash").getOrCreate()

    @udf("string")
    def pbkdf2_sha1hmac(msg, salt, iterations, klen):
        import base64
        import hashlib
        digest = hashlib.pbkdf2_hmac('sha1', str.encode(msg),
                                     str.encode(salt), iterations, klen)
        return base64.b64encode(digest).decode()

    source = spark.read.format("bigquery").option(
        "table", f"{project}.{input_table}").load()

    identifier = pbkdf2_sha1hmac(col("gps_adid"), lit(salt), lit(iterations),
                                 lit(klen))
    hashed = source.where(~isnull("gps_adid")).withColumn(
        "identifier", identifier).select("identifier", "installed_at")

    writer = hashed.write.format("bigquery").option(
        "table", f"{project}.{output_table}").option(
            "temporaryGcsBucket", bucket)
    writer.mode("overwrite").save()

    spark.stop()
def remove_positive_samples_from_negative_samples(neg_samples, pos_samples):
    """Return the negative samples that are not also positive samples.

    Samples are matched on ``date``, ``hour`` and ``street_id``.

    Args:
        neg_samples: DataFrame of negative samples.
        pos_samples: DataFrame of positive samples.

    Returns:
        The rows of ``neg_samples`` whose key does not occur in
        ``pos_samples``.
    """
    join_keys = ['date', 'hour', 'street_id']
    # left_anti keeps only unmatched left rows; no helper 'exists' column is
    # needed, so an existing column of that name can no longer collide.
    positive_keys = pos_samples.select(*join_keys)
    return neg_samples.join(positive_keys, join_keys, 'left_anti')
def cleanNullVisitors(self, adsDf):
    """Drop the rows without a ``visitorId`` and print the row counts.

    Args:
        adsDf: input DataFrame.

    Returns:
        DataFrame containing only rows with a non-null ``visitorId``.
    """
    totalRecords = adsDf.count()
    adsFilteredVisitors = adsDf.filter(~isnull("visitorId"))
    # Removed = total minus remaining. The remaining count used to be
    # printed under the "Removed" label.
    totalRecordsNull = totalRecords - adsFilteredVisitors.count()
    print("Total number of records::" + str(totalRecords))
    print("Removed visitors with null visitorId count::" +
          str(totalRecordsNull))
    return adsFilteredVisitors
def l2_to_l3(input_df):
    """Keep the latest, non-deleted record of every ``INV_ID``.

    For each ``INV_ID`` only the row with the greatest
    ``LAST_UPDT_DATE_TIME`` is retained; it is then dropped as well when its
    ``LOG_DEL_IND`` is not null.

    Args:
        input_df: input DataFrame.

    Returns:
        DataFrame with the input columns plus a ``row_number`` column
        (always 1).
    """
    # The former global sort was removed: the window defines its own
    # ordering, so the sort only added a full shuffle.
    latest_first = Window.partitionBy(input_df['INV_ID']).orderBy(
        input_df['LAST_UPDT_DATE_TIME'].desc())
    latest = input_df.select(
        '*', row_number().over(latest_first).alias('row_number')
    ).filter(col('row_number') == 1)
    return latest.where(F.isnull(F.col("LOG_DEL_IND")))
def filter_df(self, df):
    """Keep the rows tagged with this experiment and expose the branch.

    Rows are kept when ``experiments[self.experiment_slug]`` is not null and
    ``submission_date_s3`` is on or after ``self.start_date``. The tag value
    is copied into a ``branch`` column.
    """
    branch = df.experiments[self.experiment_slug]
    tagged = df.filter(~F.isnull(branch))
    in_period = tagged.filter(df.submission_date_s3 >= self.start_date)
    return in_period.withColumn('branch', branch)
def null_values(df):
    """Count the null values of every non-timestamp column.

    Args:
        df: DataFrame to inspect.

    Returns:
        dict mapping column name to its number of nulls. Columns of type
        ``timestamp`` are skipped; the dict is empty when no column is left.
    """
    null_counts = [
        count(when(isnull(name), name)).alias(name)
        for name, dtype in df.dtypes  # no longer shadows the builtin `type`
        if dtype != "timestamp"
    ]
    if not null_counts:
        return {}
    # The aggregation yields exactly one row; read it directly instead of
    # round-tripping through JSON.
    return df.select(null_counts).collect()[0].asDict()
def get_growth(data, key):
    """Compute the percentage change between consecutive values.

    The series is read from ``data.collect()[key][1]`` as ``(date, value)``
    pairs. ``diff`` is the change relative to the previous value (by date)
    times 100, or 0 where that cannot be computed (e.g. the first row).

    Returns:
        Tuple ``(dates, diffs)``: the UTF-8 encoded dates and the ``diff``
        values.
    """
    series = sc.parallelize(data.collect()[key][1])
    df = sqlc.createDataFrame(series, ["date", "value"])

    by_date = Window.partitionBy().orderBy("date")
    df = df.withColumn("prev_value", F.lag(df.value).over(by_date))

    relative_change = (df.value - df.prev_value) / df.prev_value
    # The trailing * 100 applies to the whole when/otherwise expression.
    growth = F.when(F.isnull(relative_change * 100), 0).otherwise(
        relative_change) * 100
    df = df.withColumn("diff", growth)

    dates = df.rdd.map(lambda row: row.date.encode("utf-8")).collect()
    diffs = df.rdd.map(lambda row: row.diff).collect()
    return dates, diffs
def get_missing(self):
    """Count the missing values of each selected variable.

    The variables are the first element returned by
    ``self.get_variables_segregated()``.

    Returns:
        Spark DataFrame with one row per variable: ``Variavel`` (the
        variable name) and ``missing`` (its number of nulls).
    """
    # Evaluate the variable list once; it used to be fetched three times.
    variables = list(self.get_variables_segregated()[0])

    # isnull() is already boolean; no comparison with the string 'true'.
    nulls = self.data_frame.select(
        *(when(isnull(c), 1).otherwise(0).alias(c) for c in variables))
    agrupaded = nulls.groupBy().sum(*variables)
    final = agrupaded.select(
        *(agrupaded['sum(' + c + ')'].alias(c) for c in variables)
    ).toPandas().transpose()
    # After transposing, the variable names form the index.
    final = final.reset_index()

    return spark_session.createDataFrame(final).\
        withColumnRenamed('index', 'Variavel').\
        withColumnRenamed('0', 'missing')
def transform(self, df, lowerPerc=None, upperPerc=None):
    """Filter out rows outside the trained percentile thresholds.

    For every column in ``self.columns_`` a row is kept when its value is
    strictly above the lower threshold / strictly below the upper threshold,
    or when the value is null. Thresholds are read from ``self.ntiles_`` at
    index ``int(lowerPerc)`` / ``int(upperPerc)``.

    Args:
        df: DataFrame to filter.
        lowerPerc: lower percentile; applied only when 0 < lowerPerc < 100.
        upperPerc: upper percentile; applied only when 0 < upperPerc < 100.

    Returns:
        The filtered DataFrame (``df`` itself when no filter applies).

    Raises:
        ValueError: when the instance has not been trained yet.
    """
    # check if trained
    if not self._is_trained:
        raise ValueError('You must call train method first')

    # lower filtering
    if lowerPerc is not None and 0 < lowerPerc < 100:
        lowerFilter = self.ntiles_[:, int(lowerPerc)][:, 1]
        for i, feature in enumerate(self.columns_):
            df = df.where(
                (df[feature] > lowerFilter[i]) | F.isnull(df[feature]))

    # upper filtering
    if upperPerc is not None and 0 < upperPerc < 100:
        upperFilter = self.ntiles_[:, int(upperPerc)][:, 1]
        for i, feature in enumerate(self.columns_):
            df = df.where(
                (df[feature] < upperFilter[i]) | F.isnull(df[feature]))

    return df
# Flatten the redemption events: enqueue time plus the fields of
# jsonBody.redemption that are used downstream.
redemptionsDf = redDf.select(
    redDf["EnqueuedDateTimeUTC"],
    redDf["jsonBody.redemption.rewardsRedemptionId"],
    redDf["jsonBody.redemption.rewardsTransactionId"],
    redDf["jsonBody.redemption.correlationId"],
    redDf["jsonBody.redemption.eventId"],
    redDf["jsonBody.redemption.programName"],
    redDf["jsonBody.redemption.createdDate"],
    redDf["jsonBody.redemption.pointsRedeemed"],
    redDf["jsonBody.redemption.accrualCreatedDate"],
    redDf["jsonBody.redemption.accrualRewardsActivityName"],
    redDf["jsonBody.redemption.accrualBrandCode"],
    redDf["jsonBody.redemption.accrualChannelSource"],
    redDf["jsonBody.redemption.modifiedDate"],
    redDf["jsonBody.redemption.accrualStorenumber"],
    redDf["jsonBody.redemption.accrualMerchantid"],
    redDf["jsonBody.redemption.typeCode"])

# Flatten the member balance events in the same way.
memberBalancesDf = memBalDf.select(
    memBalDf["EnqueuedDateTimeUTC"],
    memBalDf["jsonBody.membershipPointBalance.membershipId"],
    memBalDf["jsonBody.membershipPointBalance.memberId"],
    memBalDf["jsonBody.membershipPointBalance.programName"],
    memBalDf["jsonBody.membershipPointBalance.currentLevelPoints"],
    memBalDf["jsonBody.membershipPointBalance.modifiedDate"],
    memBalDf["jsonBody.membershipPointBalance.eventId"])

# COMMAND ----------

# Keep only accrual records with pointsUsed = 0 and pointType 'Reward', and
# drop the 'Point Expiration Updated' sub type. Additionally, an accrual
# record gets sent when a member changes status from chase to non-chase or
# vice versa.
# NOTE(review): the earlier comment said records of type Reward are NOT
# needed, while the filter keeps exactly those -- confirm which is intended.
accrualsDf = accrualsDf.filter("pointsUsed = '0'")
accrualsDf = accrualsDf.filter("pointType = 'Reward'")
accrualsDf = accrualsDf.filter("rewardsTransactionSubType <> 'Point Expiration Updated'")

# COMMAND ----------

# derive tendertype
from pyspark.sql import functions as F
# Without a card number the tender is "Non-SVC", otherwise "SVC".
# Both branches used to yield "Non-SVC", which made the column constant.
transactionsDf = transactionsDf.withColumn(
    "tender_type",
    F.when(F.isnull(transactionsDf['cardNumber']), "Non-SVC").otherwise("SVC"))

# COMMAND ----------

# check the pointsAccrued and pointRedeemed columns as to whether they are
# numbers, and keep only the rows with valid numeric values; the rejected
# rows are kept in the *NumberErrorDf frames.
from pyspark.sql import functions as F
accrualsDf = accrualsDf.withColumn(
    "pointsAccruedNumber", accrualsDf.pointsAccrued.cast('double'))
accrualsNumberErrorDf = accrualsDf.where(accrualsDf["pointsAccruedNumber"].isNull())
accrualsDf = accrualsDf.where(accrualsDf["pointsAccruedNumber"].isNotNull())

redemptionsDf = redemptionsDf.withColumn(
    "pointsRedeemedNumber", redemptionsDf.pointsRedeemed.cast('double'))
memberBalancesDf = memberBalancesDf.withColumn(
    "currentLevelPointsNumber",
    memberBalancesDf.currentLevelPoints.cast('double'))
redemptionsNumberErrorDf = redemptionsDf.where(redemptionsDf["pointsRedeemedNumber"].isNull())
redemptionsDf = redemptionsDf.where(redemptionsDf["pointsRedeemedNumber"].isNotNull())

# COMMAND ----------