def prepare_df(df):
    num_rows = df.count()

    # Expand dates.
    df = expand_date(df)

    df = df \
        .withColumn('Open', df.Open != '0') \
        .withColumn('Promo', df.Promo != '0') \
        .withColumn('StateHoliday', df.StateHoliday != '0') \
        .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

    # Merge in store information.
    store = store_csv.join(store_states_csv, 'Store')
    df = df.join(store, 'Store')

    # Merge in Google Trend information.
    google_trend_all = prepare_google_trend()
    df = df.join(google_trend_all, ['State', 'Year', 'Week']) \
        .select(df['*'], google_trend_all.trend)

    # Merge in Google Trend for whole Germany.
    google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
    google_trend_de = google_trend_de.withColumnRenamed('trend', 'trend_de')
    df = df.join(google_trend_de, ['Year', 'Week']) \
        .select(df['*'], google_trend_de.trend_de)

    # Merge in weather.
    weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ['State', 'Date'])

    # Fix null values.
    df = df \
        .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
        .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
        .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
        .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

    # Days & months competition was open, cap to 2 years.
    df = df.withColumn('CompetitionOpenSince',
                       F.to_date(F.format_string('%s-%s-15',
                                                 df.CompetitionOpenSinceYear,
                                                 df.CompetitionOpenSinceMonth)))
    df = df.withColumn('CompetitionDaysOpen',
                       F.when(df.CompetitionOpenSinceYear > 1900,
                              F.greatest(F.lit(0),
                                         F.least(F.lit(360 * 2),
                                                 F.datediff(df.Date, df.CompetitionOpenSince))))
                       .otherwise(0))
    df = df.withColumn('CompetitionMonthsOpen', (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # Days & weeks of promotion, cap to 25 weeks.
    df = df.withColumn('Promo2Since',
                       F.expr('date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'))
    df = df.withColumn('Promo2Days',
                       F.when(df.Promo2SinceYear > 1900,
                              F.greatest(F.lit(0),
                                         F.least(F.lit(25 * 7),
                                                 F.datediff(df.Date, df.Promo2Since))))
                       .otherwise(0))
    df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType()))

    # Check that we did not lose any rows through inner joins.
    assert num_rows == df.count(), 'lost rows in joins'
    return df

def count_time_since(self, col_for_time_since, time_since_date):
    """Compute the number of days between each value of `col_for_time_since`
    (which must be in a timestamp-compatible format) and `time_since_date`
    (which should be in dd/MM/yyyy format)."""
    # self._metaHelperInstance = MetaDataHelper(self._data_frame, self._data_frame.count())
    self._data_frame = self._data_frame.withColumn(
        col_for_time_since + '_temp', self.to_date_(col_for_time_since))
    uniqueVals = self._data_frame.select(
        col_for_time_since + '_temp').distinct().na.drop().limit(1000).collect()
    try:
        date_format = self._metaHelperInstance.get_datetime_format(uniqueVals)
        self._data_frame = self._data_frame.withColumn(
            "TIME_SINCE_DATE", F.lit(time_since_date))
        to_date_udf = udf(
            lambda x: datetime.strptime(x, date_format) if x is not None else x,
            DateType())
        self._data_frame = self._data_frame.withColumn(
            col_for_time_since + '_temp',
            to_date_udf(col(col_for_time_since + '_temp')))
        self._data_frame = self._data_frame.withColumn(
            "TIME_SINCE_DATE(Timestamped)",
            to_timestamp(self._data_frame["TIME_SINCE_DATE"], "dd/MM/yyyy"))
        self._data_frame = self._data_frame.withColumn(
            col_for_time_since + "_time_since",
            datediff(self._data_frame["TIME_SINCE_DATE(Timestamped)"],
                     self._data_frame[col_for_time_since + '_temp']))
        self._data_frame = self._data_frame.drop(
            "TIME_SINCE_DATE", "TIME_SINCE_DATE(Timestamped)")
    except TypeError:
        # Fall back to the raw parsed dates when no datetime format can be
        # inferred from the sampled values.
        self._data_frame = self._data_frame.withColumn(
            "TIME_SINCE_DATE", F.lit(time_since_date))
        self._data_frame = self._data_frame.withColumn(
            "TIME_SINCE_DATE(Timestamped)",
            to_timestamp(self._data_frame["TIME_SINCE_DATE"], "dd/MM/yyyy"))
        self._data_frame = self._data_frame.withColumn(
            col_for_time_since + "_time_since",
            datediff(self._data_frame["TIME_SINCE_DATE(Timestamped)"],
                     self._data_frame[col_for_time_since + '_temp']))
        self._data_frame = self._data_frame.drop(
            "TIME_SINCE_DATE", "TIME_SINCE_DATE(Timestamped)")
    # self._data_frame = self._data_frame.withColumn(col_for_time_since, to_timestamp(self._data_frame[col_for_time_since], "dd/MM/yyyy").alias(col_for_time_since))
    # self._data_frame = self._data_frame.withColumn(col_for_time_since, F.from_unixtime(F.unix_timestamp(self._data_frame[col_for_time_since]), "dd/MM/yyyy").alias(col_for_time_since))
    self._data_frame = self._data_frame.drop(col_for_time_since + '_temp')
    return self._data_frame

def clean_data(df):
    get_people_type = udf(people_type, IntegerType())
    get_trip_type = udf(trip_type, IntegerType())
    compare = udf(lambda x, y: int(x != y), IntegerType())
    greater_than_zero = udf(lambda x: max(x, 0))

    df = df.na.fill({'orig_destination_distance': 0})
    df = df.withColumn(
        'people_type', get_people_type(df.srch_children_cnt, df.srch_adults_cnt))
    # df.drop(['srch_children_cnt','srch_adults_cnt'], axis=1, inplace=True)
    # df.drop(['plan_days','travel_days'], axis=1, inplace=True)
    df = df.withColumn('foreign', compare(df.user_location_country, df.hotel_country))
    df = df.withColumn('diff_conti', compare(df.posa_continent, df.hotel_continent))
    df = df.withColumn('plan_hour', hour(df.date_time))

    df1 = df.filter(df.is_booking == 1)
    df1 = df1.na.drop()
    df1 = df1.withColumn(
        'plan_days', greater_than_zero(datediff(df1.srch_ci, df1.date_time)))
    df1 = df1.withColumn('travel_days',
                         greater_than_zero(datediff(df1.srch_co, df1.srch_ci)))
    df1 = df1.withColumn('travel_month', month(df1.srch_ci))
    df1 = df1.withColumn('price_multiplier', df1.travel_days * df1.srch_rm_cnt)
    df1 = df1.withColumn('trip_type', get_trip_type(df1.plan_days, df1.travel_days))
    df1 = df1.select([
        c for c in df1.columns
        if c not in ['is_booking', 'srch_ci', 'srch_co', 'date_time']
    ])

    df2 = df.filter(df.is_booking == 0)
    df2 = df2.select([
        c for c in df2.columns
        if c not in ['is_booking', 'srch_ci', 'srch_co', 'date_time']
    ])
    df2 = df2.na.drop()

    hotel_prob(df1, PATH + 'hotel_destinations_all1.csv')
    hotel_prob(df2, PATH + 'hotel_destinations_all2.csv')
    return df1, df2

def process_for_counts_and_ageing_days(self, filtered_df):
    mapping_expr = F.create_map(
        [F.lit(x) for x in chain(*AGEING_DAY_WEIGHT_MAP.items())])
    today = F.lit(datetime.datetime.today().date())
    count_segregation_df = filtered_df \
        .withColumn("SuccessFlag", F.when(filtered_df.CallStatus == 'Success', 1).otherwise(0)) \
        .withColumn("DropFlag", F.when(filtered_df.CallStatus == 'Drop', 1).otherwise(0)) \
        .withColumn("Age", F.datediff(today, F.col("CallDate"))) \
        .withColumn("AgeScore", mapping_expr.getItem(F.datediff(today, F.col("CallDate")))) \
        .groupBy(F.col("MSISDN"), F.col("CallDate"), F.col("CellId"),
                 F.col('Age'), F.col('AgeScore')) \
        .agg(F.sum('SuccessFlag').alias('SuccessCount'),
             F.sum('DropFlag').alias('DropCount'),
             F.sum(F.col('SuccessFlag') + F.col('DropFlag')).alias("CellCallCount"),
             (F.sum('DropFlag') / F.sum(F.col('SuccessFlag') + F.col('DropFlag'))).alias('DropRatio')) \
        .orderBy(F.col('CellId'), ascending=True)
    return count_segregation_df

def experiment_pings_df(pings_df, membership_df, observation_period):
    """
    Get the subset of pings that are:
      1. in the membership df (by client_id)
      2. dated after enrollment (at least the day after) and before the end
         of the observation period

    Note: if pings_df has a branch column, join on (client_id, branch).
    """
    # If the observation period is negative, get the preceding days;
    # if positive, get the following days.
    top = observation_period
    bottom = 0
    if observation_period < 0:
        top = 0
        bottom = observation_period

    # Rename the column so joining is easier.
    pings_df = pings_df.withColumn('client_id_pings', F.col('client_id')).drop('client_id')

    # If branch exists, join on client_id, branch, and the date range.
    if 'branch' in pings_df.columns:
        pings_df = pings_df.withColumn('branch_pings', F.col('branch')).drop('branch')
        df = membership_df.join(
            pings_df,
            (F.col('client_id') == F.col('client_id_pings')) &
            (F.col('branch') == F.col('branch_pings')) &
            (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) < top) &
            (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) > bottom),  # no enroll day pings
            how='left')
        df = df.drop('branch_pings')
    else:
        df = membership_df.join(
            pings_df,
            (F.col('client_id') == F.col('client_id_pings')) &
            (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) < top) &
            (F.datediff(F.col('activity_dt'), F.col('enrollment_dt')) > bottom),  # no enroll day pings
            how='left')

    # Clean up columns.
    df = df.drop('client_id_pings')
    return df

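# Boundary behaviour of the join above: with observation_period=7 the kept
# pings satisfy datediff(activity_dt, enrollment_dt) in 1..6, and with
# observation_period=-7 they satisfy -6..-1; both bounds are exclusive, so
# the enrollment day itself is never matched.
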
def glean_2(invoice_df, line_item_df):
    invoice = invoice_df.alias('invoice')
    line_item = line_item_df.alias('line_item')
    joined_table = invoice.join(line_item,
                                invoice.invoice_id == line_item.invoice_id,
                                how='left')
    glean2 = joined_table.groupBy(
        invoice.invoice_id, invoice.canonical_vendor_id, invoice.invoice_date,
        invoice.period_end_date).agg(
            max_('line_item.period_end_date').alias('max_line_end_date'))
    glean2 = glean2.withColumn(
        'end_date', funcs.greatest('max_line_end_date', invoice.period_end_date))
    glean2 = glean2.withColumn('difference',
                               funcs.datediff('end_date', 'invoice_date'))
    glean2 = glean2[glean2['difference'] > 90]
    glean2 = glean2.withColumn('glean_location', funcs.lit('invoice'))
    glean2 = glean2.withColumn("glean_type", funcs.lit('accrual_alert'))
    glean2 = glean2.withColumn(
        "glean_text",
        funcs.concat(
            funcs.lit('Line items from vendor '),
            funcs.col('canonical_vendor_id'),
            funcs.lit(' in this invoice cover future periods (through '),
            funcs.col('end_date'),
            funcs.lit(' )')))
    glean2 = glean2.withColumn('glean_date', funcs.col('invoice_date'))
    return glean2

def execute(self):
    """Execute the link.

    :returns: status code of execution
    :rtype: StatusCode
    """
    ds = process_manager.service(DataStore)

    # --- your algorithm code goes here
    self.logger.debug('Now executing link: {link}.', link=self.name)
    df = ds[self.read_key]

    window = Window.orderBy(self.datetime_col) \
        .rowsBetween(0, Window.unboundedFollowing)
    if self.partitionby_cols:
        window = window.partitionBy(self.partitionby_cols)

    date_col = f.to_date(self.datetime_col)  # only dates per row
    reset_col = f.when(f.col(self.event_col) > 0, date_col)  # dates of events
    bfilled_reset_col = f.first(reset_col, ignorenulls=True).over(window)  # backfilled dates of events
    countdown_col = f.datediff(bfilled_reset_col, date_col)  # difference between next event date and today

    ds[self.store_key] = df.withColumn(self.countdown_col_name, countdown_col)

    return StatusCode.Success

def generate_age_information(self, dcc_experiment_df: DataFrame, mice_df: DataFrame):
    experiment_df_a = dcc_experiment_df.alias("exp")
    mice_df_a = mice_df.alias("mice")
    dcc_experiment_df = experiment_df_a.join(
        mice_df_a,
        (experiment_df_a["specimenID"] == mice_df_a["_specimenID"])
        & (experiment_df_a["_centreID"] == mice_df_a["_centreID"]),
        "left_outer",
    )
    dcc_experiment_df = self._add_special_dates(dcc_experiment_df)
    dcc_experiment_df = dcc_experiment_df.withColumn(
        "ageInDays",
        datediff(
            when(
                col("_dateOfBloodCollection").isNotNull(),
                col("_dateOfBloodCollection"),
            )
            .when(col("_dateOfSacrifice").isNotNull(), col("_dateOfSacrifice"))
            .otherwise(col("_dateOfExperiment")),
            col("mice._DOB"),
        ),
    )
    dcc_experiment_df = dcc_experiment_df.withColumn(
        "ageInWeeks",
        udf(lambda x: math.floor(x / 7) if x is not None else None,
            IntegerType())(col("ageInDays")),
    )
    return dcc_experiment_df.select("exp.*", "ageInWeeks", "ageInDays")

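# Note: ageInWeeks could equivalently be computed without a Python UDF as
# floor(col("ageInDays") / 7).cast("int"), which keeps the computation inside
# the JVM and handles nulls the same way (floor of null is null).
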
def transform_immigration(self, immigration_input_path: str,
                          immigration_output_path: str,
                          date_output_path: str) -> bool:
    # read immigration data
    imgDF = self._read_data(immigration_input_path, 'immigration')

    # transform
    print('Starting transform immigration dataset ...')
    timer = time.time()

    # clean immigration data
    imgDF = self._cleaner.clean_immigration_data(imgDF)

    # extract dim dates
    self._extract_dim_date(imgDF, date_output_path)

    imgDF = imgDF.withColumn(
        'stay_days',
        F.datediff(F.col('allowed_stay_date'), F.col('arrival_date')))

    print(f'Finished transform immigration dataset in: {(time.time() - timer):06.2f} sec.')
    print_data_info(imgDF, 'immigrationDF')

    # write
    self._write_data(imgDF, immigration_output_path, 'immigration')
    return True

def process(spark, input_file, target_path):
    df = spark.read.parquet(input_file)
    df = df.withColumn('click', F.when(F.col('event') == 'click', 1).otherwise(0)) \
           .withColumn('view', F.when(F.col('event') == 'view', 1).otherwise(0)) \
           .withColumn('is_cpm', F.when(F.col('ad_cost_type') == 'CPM', 1).otherwise(0)) \
           .withColumn('is_cpc', F.when(F.col('ad_cost_type') == 'CPC', 1).otherwise(0))
    df_ctr = df.groupBy('ad_id') \
        .agg(F.sum('click').alias('num_of_clicks'),
             F.sum('view').alias('num_of_views'),
             F.min('date').alias('min'),
             F.max('date').alias('max')) \
        .withColumn('CTR', col('num_of_clicks') / col('num_of_views')) \
        .withColumn('day_count', F.datediff(col('max'), col('min')))
    df = df.drop('date', 'time', 'event', 'platform', 'client_union_id',
                 'compaign_union_id', 'ad_cost_type', 'click', 'view').distinct()
    final_data = df_ctr.join(df, ['ad_id'], 'right').select(
        'ad_id', 'target_audience_count', 'has_video', 'is_cpm', 'is_cpc',
        'ad_cost', 'day_count', 'CTR')
    train, test, validate = final_data.randomSplit([0.5, 0.25, 0.25])
    train.coalesce(1).write.parquet(target_path + '/train')
    test.coalesce(1).write.parquet(target_path + '/test')
    validate.coalesce(1).write.parquet(target_path + '/validate')

def build_static_features(self):
    '''
    Features:
        gender
        state
        useragent
        customer_age (how long the customer has been registered)
    '''
    gender = self.df.select(
        'userId',
        F.when(F.col('gender') == "F", 1).otherwise(0).alias('female')).distinct()
    state = self.one_hot_encode(
        self.df.select('userId',
                       F.explode(self.extract_state('location')).alias('state')).distinct(),
        'state')
    customer_age = self.df \
        .select('userId', self.clean_ts('registration').alias("registration")) \
        .select('userId',
                F.datediff(F.to_date(F.lit(self.max_time), "yyyy-MM-dd"),
                           F.to_date('registration', "yyyy-MM-dd")).alias('customer_age')) \
        .distinct()
    agent_one_hot = self.one_hot_encode(
        self.df
        .select('userId', F.explode(self.clean_userAgent('userAgent')))
        .distinct()
        .withColumnRenamed('col', 'agent'),
        'agent')
    return (gender
            # .join(state, on="userId", how='outer')
            .join(customer_age, on="userId", how='outer')
            .join(agent_one_hot, on="userId", how='outer'))

def get_new_features(df: DataFrame):
    return (df
            .withColumn(
                "AgeGroup",
                f.concat((f.col("AgeGroup") * 5).cast("int").cast("string"),
                         f.lit("-"),
                         (f.col("AgeGroup") * 5 + 5).cast("int").cast("string")))
            .withColumn("MaturityDateDelay",
                        f.datediff("MaturityDate_Last", "MaturityDate_Original"))
            .withColumn("AmountNotGranted",
                        f.col("AppliedAmount") - f.col("Amount"))
            .withColumn("DaysToFirstPayment",
                        f.datediff("FirstPaymentDate", "LoanDate"))
            .drop("MaturityDate_Original", "AppliedAmount", "FirstPaymentDate"))

def transform_immigration_dataset(self):
    df_spark = spark.read.parquet('../workspace/immigration_files/sas_data/*.parquet')
    df_immi_fact = df_spark.fillna('Not available', subset=['gender', 'occup', 'i94addr'])
    df_date = df_immi_fact \
        .withColumn("arrival_date", udf_datetime_from_sas("arrdate")) \
        .withColumn("departure_date", udf_datetime_from_sas("depdate"))
    df_date = df_date.withColumn("cicid", df_date["cicid"].cast(IntegerType())) \
        .withColumn("i94yr", df_date["i94yr"].cast(IntegerType())) \
        .withColumn("i94mon", df_date["i94mon"].cast(IntegerType())) \
        .withColumn("i94cit", df_date["i94cit"].cast(IntegerType())) \
        .withColumn("i94res", df_date["i94res"].cast(IntegerType())) \
        .withColumn("i94mode", df_date["i94mode"].cast(IntegerType())) \
        .withColumn("i94addr", df_date["i94addr"]) \
        .withColumn("i94bir", df_date["i94bir"].cast(IntegerType())) \
        .withColumn("i94visa", df_date["i94visa"].cast(IntegerType())) \
        .withColumn("count", df_date["count"].cast(IntegerType())) \
        .withColumn("biryear", df_date["biryear"].cast(IntegerType())) \
        .withColumn("no_of_days_stayed", datediff(col("departure_date"), col("arrival_date")))
    immigration = df_date.select(
        'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port', 'i94mode',
        'i94addr', 'i94bir', 'i94visa', 'visapost', 'entdepa', 'biryear',
        'gender', 'airline', 'visatype', 'arrival_date', 'departure_date',
        'no_of_days_stayed')
    immigration.write.partitionBy("i94mon").mode("append").parquet(save_path + "/sasdata/")

def age_calculator(self, columns, age_colname):
    """
    @columns: a string of column name
    @age_colname: a string naming the new age column, e.g. 'age'
    """
    assert isinstance(columns, str), "Error: columns argument must be a string!"
    assert isinstance(age_colname, str), \
        "Error: the name for age column argument must be a string!"
    df = self._df.select(
        columns,
        from_unixtime(unix_timestamp(columns, 'MM/dd/yyyy')).alias('with time'))
    df = df.withColumn('without time', df['with time'].cast(DateType()))
    df = df.withColumnRenamed(columns, 'tmp')
    df = df.withColumn(age_colname,
                       ((datediff(current_date(), col('without time'))
                         .cast(FloatType())) / 365).cast(IntegerType()))
    self._df = self._df.join(
        df, col(columns) == col('tmp'),
        'leftouter').drop('tmp').drop('with time').drop('without time')
    return self

def preprocessing(spark_df):
    smart_feature_columns = [column for column in spark_df.columns if 'smart' in column]
    window_spec_7 = Window.partitionBy('model', 'serial_number').orderBy(
        F.datediff(F.col('dt'), F.lit('2017-07-01'))).rangeBetween(-7, 0)
    prefix_window7 = 'window_7_'
    for smart_col in smart_feature_columns:
        spark_df = spark_df.withColumn(smart_col, F.col(smart_col).cast(DoubleType()))
        if smart_col in ['smart_1_normalized', 'smart_5raw', 'smart_7_normalized',
                         'smart_194raw', 'smart_199raw', 'smart_190raw',
                         'smart_191raw', 'smart_193raw', 'smart_195_normalized',
                         'smart_195raw']:
            spark_df = spark_df.withColumn(
                prefix_window7 + 'range_' + smart_col,
                F.max(F.col(smart_col)).over(window_spec_7) -
                F.min(F.col(smart_col)).over(window_spec_7))
            spark_df = spark_df.withColumn(
                prefix_window7 + 'std_' + smart_col,
                F.stddev(F.col(smart_col)).over(window_spec_7))
        # if smart_col in ['smart_187raw', 'smart_188raw', 'smart_197raw', 'smart_198raw']:
        #     spark_df = spark_df.withColumn(smart_col, F.when(F.col(smart_col) > 0, 1).otherwise(0))
        # if smart_col in ['smart_187_normalized', 'smart_188_normalized', 'smart_197_normalized', 'smart_198_normalized']:
        #     spark_df = spark_df.withColumn(smart_col, F.when(F.col(smart_col) < 100, 1).otherwise(0))
        if smart_col in ['smart_4raw', 'smart_5raw', 'smart_191raw',
                         'smart_187raw', 'smart_197raw', 'smart_198raw',
                         'smart_199raw', 'window_7_range_smart_199raw']:
            spark_df = spark_df.withColumn(smart_col, F.log2(F.col(smart_col) + F.lit(1.)))
    spark_df = spark_df.withColumn(
        'smart_199raw', F.col('smart_199raw') * F.col('window_7_range_smart_199raw'))
    spark_df = spark_df.withColumn(
        'anomaly_sum',
        F.col('smart_4raw') / 12 + F.col('smart_5raw') / 16 +
        F.col('smart_191raw') / 18 + F.col('smart_198raw') / 18 +
        F.col('smart_197raw') / 18 + F.col('smart_187raw') / 15)
    return spark_df

def add_datediff(df, date_col, start_date):
    """Add a day-index column counted from start_date, plus its square and
    square root."""
    df = df.withColumn('datediff', F.datediff(F.col(date_col), F.lit(start_date)))
    df = df.withColumn('datediff_square', F.pow(F.col('datediff'), 2))
    df = df.withColumn('datediff_square_root', F.pow(F.col('datediff'), 0.5))
    return df

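# A minimal usage sketch for add_datediff (assumes a SparkSession named
# `spark`, which this snippet does not define):
#
#   df = spark.createDataFrame([("2020-01-01",), ("2020-01-10",)], ["dt"])
#   add_datediff(df, "dt", "2020-01-01").show()
#   # -> datediff 0 and 9, plus the squared and square-root variants
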
def get_target(df: DataFrame):
    w = Window.partitionBy("LoanID").orderBy("Date")
    return (df
            .withColumn("nextPaymentDiff",
                        f.datediff("Date", f.lag("Date").over(w)))
            .fillna(0, subset=["nextPaymentDiff"])
            .withColumn("DaysFromStart", f.datediff("Date", "LoanDate"))
            .filter(f.col("DaysFromStart") <= default_prediction)
            .withColumn("label",
                        f.when(f.col("nextPaymentDiff") < default_days, 0).otherwise(1))
            .withColumn("FeaturesForPrediction",
                        f.when(f.col("DaysFromStart") <= observation_period, 1).otherwise(0)))

def countUserRegTime(accountdf, df):
    """Count registration tenure for users active on a given day.

    From all registered users, select those who were online that day,
    compute the difference between the two dates, then count the number
    of users for each distinct difference and sort by it.
    """
    return df.join(accountdf, df.uid == accountdf.uid, 'inner').select(
        F.datediff(df.day, accountdf.regtime).alias('daydiff')).groupBy(
            'daydiff').count().sort(F.asc('daydiff'))

def __init__(self, df):
    '''
    Intake the event data; print a summary and build the first-level
    cleaned data.
    '''
    print("Number of rows in the data: {:,}".format(df.count()))
    print("Number of rows with N/A user Ids: {:,}".format(df.filter("userId is Null").count()))
    print("Number of unique customers: {:,}".format(df.select('userId').distinct().count()))
    self.df = df
    self.clean_userAgent = udf(self.clean_userAgent, ArrayType(StringType()))
    self.clean_ts = udf(self.clean_ts, StringType())
    self.clean_ts_hour = udf(self.clean_ts_hour, StringType())
    self.extract_state = udf(self.extract_state, ArrayType(StringType()))
    self.user_activities = self.df \
        .filter("auth = 'Logged In'") \
        .select('userId',
                'registration',
                self.clean_ts('registration').alias("registration_ts"),
                'level',
                F.col('ts').alias('time'),
                self.clean_ts('ts').alias("timestamp"),
                self.clean_ts_hour('ts').alias("time_in_the_day"),
                'sessionId',
                'itemInSession',
                'page',
                'artist',
                F.concat(F.col('artist'), F.lit(" - "), F.col('song')).alias("singer-song"),
                'length') \
        .withColumn('days_since_registration',
                    F.datediff(F.to_date('timestamp', "yyyy-MM-dd"),
                               F.to_date('registration_ts', "yyyy-MM-dd")))
    self.max_time = self.user_activities \
        .groupby() \
        .max('time') \
        .select(self.clean_ts('max(time)').alias('max_time')) \
        .collect()[0]['max_time']
    self.user_activities = self.user_activities.withColumn(
        'days_before_today',
        F.datediff(F.to_date(F.lit(self.max_time), "yyyy-MM-dd"),
                   F.to_date('timestamp', "yyyy-MM-dd")))

def user_sessions_duration_less_than(duration: int, data: DataFrame):
    # Session duration in days: last activity minus first (datediff takes
    # the end date first, so max comes before min).
    rows: List[Row] = data \
        .groupBy('session_id') \
        .agg(datediff(max(to_date(col('datetime'))),
                      min(to_date(col('datetime')))).alias('session_duration')) \
        .filter(col('session_duration') <= duration) \
        .drop("session_duration").collect()
    users: List[str] = [str(row.session_id) for row in rows]
    return users

def add_date_info_spark(df, convert=True):
    '''Take a Spark df and add date info.'''
    if convert:
        f_datestring = F.UserDefinedFunction(lambda x: x[:-4] + '+00:00', StringType())
        df = df.withColumn("first_event", f_datestring(df.first_event))
        df = df.withColumn("last_event", f_datestring(df.last_event))
    df = df.withColumn("first_event", F.to_timestamp(df.first_event))
    df = df.withColumn("last_event", F.to_timestamp(df.last_event))
    df = df.withColumn("created_at", F.to_timestamp(df.created_at))
    df = df.withColumn("updated_at", F.to_timestamp(df.updated_at))
    df = df.withColumn("recency", F.datediff(df.last_event, df.created_at))
    df = df.withColumn("time_between_first_last_event",
                       F.datediff(df.last_event, df.first_event))
    return df

def _discretize_time(self, column: sf.Column) -> sf.Column:
    days_since_study_start = sf.datediff(column, sf.lit(self.study_start))
    bucket = sf.floor(days_since_study_start / self.bucket_size).cast("int")
    if self.bucket_rounding == "floor":
        bucket = (
            sf.when((bucket < self.n_buckets) | bucket.isNull(), bucket)
            .otherwise(self.n_buckets - 1)
            .cast("int"))
    return bucket

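# Worked example for the bucketing above, assuming study_start='2020-01-01',
# bucket_size=30 and n_buckets=12: a date of 2020-02-15 is 45 days into the
# study, so floor(45 / 30) gives bucket 1; with bucket_rounding == "floor",
# dates at or past day 360 are clamped into the last bucket (index 11).
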
def add_features(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    print("[wrangle.py] adding features")
    max_date = df.select(max("case_closed_date")).first()[0]
    return (
        df.withColumn("num_weeks_late", expr("num_days_late / 7 AS num_weeks_late"))
        .withColumn(
            "council_district",
            format_string("%03d", col("council_district").cast("int")),
        )
        .withColumn("zipcode", regexp_extract("request_address", r"\d+$", 0))
        .withColumn("case_age", datediff(lit(max_date), "case_opened_date"))
        .withColumn("days_to_closed", datediff("case_closed_date", "case_opened_date"))
        .withColumn(
            "case_lifetime",
            when(expr("! case_closed"), col("case_age")).otherwise(col("days_to_closed")),
        )
    )

def get_crm_cust_log(spark):
    # crm_cust_log: customer change-log table
    #   change_type: CO_CUST.STATUS   status change
    #                CO_CUST.CUST_SEG segment (tier) change
    crm_cust_log = spark.sql(
        "select cust_id,change_type,change_frm,change_to,audit_date from DB2_DB2INST1_CRM_CUST_LOG") \
        .withColumn("audit_date", f.to_date("audit_date", "yyyyMMdd")) \
        .withColumn("day_diff", f.datediff(f.current_date(), col("audit_date"))) \
        .withColumn("change_frm", f.regexp_replace(col("change_frm"), "(zz)|(ZZ)", "31")) \
        .withColumn("change_to", f.regexp_replace(col("change_to"), "(zz)|(ZZ)", "31"))
    return crm_cust_log

def week_diff(end, start):
    """
    Compute the number of whole weeks between two dates.

    :param end: name of the end-date column
    :param start: name of the start-date column
    :return: a Column with the week difference
    """
    # date_trunc("week", date) returns the Monday of the week the date falls in
    return datediff(date_trunc("week", col(end)),
                    date_trunc("week", col(start))) / 7

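# Usage sketch for week_diff (hypothetical column names):
#
#   df.withColumn("weeks_open", week_diff("close_date", "open_date"))
#
# Because date_trunc("week", ...) snaps both dates to the Monday of their
# week, the datediff between them is always a multiple of 7, so the division
# yields a whole-valued double.
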
def set_years_between_dates(self, dataframe, start_date, end_date, years_between_field):
    """Produces a PySpark dataframe containing a field representing the number
    of years between two specified fields.

    :param dataframe: the PySpark dataframe
    :param start_date: the field containing the start date to use in calculating the years between fields
    :param end_date: the field containing the end date to use in calculating the years between fields
    :param years_between_field: the name of the field to create
    :returns: the PySpark dataframe containing the new field and all fields in the supplied dataframe
    """
    return (dataframe.withColumn(
        years_between_field,
        datediff(dataframe[end_date], dataframe[start_date]) / 365.25))

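# Dividing datediff() by 365.25 absorbs leap days: for example,
# datediff('2020-01-01', '2000-01-01') is 7305 days (20 years including five
# leap days), and 7305 / 365.25 = 20.0 exactly.
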
def inner(df):
    from pyspark.sql.functions import lag, col, datediff

    w = Window().partitionBy().orderBy(col(colName))
    df1 = (df.select(colName, lag(colName).over(w).alias("lagged_col")).na.drop())
    (df1.withColumn('diff_col', datediff(df1[colName], df1['lagged_col']))
     .sort('diff_col', ascending=False).show())
    return df

def i94_data(month, year):
    '''This function creates a parquet file from immigration data.'''
    df_spark = spark.read.format('com.github.saurfang.sas.spark') \
        .load(f'../../data/18-83510-I94-Data-2016/i94_{month}{year}_sub.sas7bdat') \
        .drop_duplicates()
    df_spark = df_spark.where(col("dtadfile").isNotNull())
    df_spark = df_spark.withColumn('stayed_days', (df_spark['depdate'] - df_spark['arrdate']))
    df_spark = df_spark.withColumn("allowed_stay_till", to_date("dtaddto", "MMddyyyy"))
    df_spark = df_spark.withColumn('date_created', to_date("dtadfile", 'yyyyMMdd'))
    df_spark = df_spark.withColumn("allowed_stay_days",
                                   datediff('allowed_stay_till', 'date_created'))
    df_spark.createOrReplaceTempView('i94')
    query = """SELECT cicid,
                      date_created,
                      i94yr AS year,
                      i94mon AS month,
                      i94cit AS citizenship,
                      i94res AS resident,
                      i94bir AS age,
                      biryear AS birth_year,
                      gender,
                      occup AS occupation,
                      allowed_stay_till,
                      allowed_stay_days,
                      stayed_days,
                      i94visa AS visa_class,
                      visatype AS visa_type,
                      i94port AS port,
                      i94mode AS mode,
                      i94addr AS arraval_state,
                      visapost AS visa_issued_by,
                      entdepa AS arrival_flag,
                      entdepd AS depart_flag,
                      entdepu AS update_flag,
                      matflag AS match_flag,
                      insnum AS ins_number,
                      airline,
                      admnum AS admission_number,
                      fltno AS flight_number
               FROM i94"""
    i94 = spark.sql(query)
    for int_col in ["cicid", "year", "month", "citizenship", "resident", "age",
                    "birth_year", "stayed_days", "visa_class", "mode",
                    "admission_number"]:
        i94 = i94.withColumn(int_col, i94[int_col].cast(IntegerType()))
    i94.write.mode('overwrite').partitionBy("month", "year").parquet(output_data + "immigration/")
    print('Immigration data was saved in parquet format on S3')

def search_clients_daily(main_summary):
    return agg_search_data(
        main_summary,
        ["client_id", "submission_date", "engine", "source"],
        list(
            map(
                agg_first,
                [
                    "country",
                    "app_version",
                    "distribution_id",
                    "locale",
                    "user_pref_browser_search_region",
                    "search_cohort",
                    "addon_version",
                    "os",
                    "os_version",
                    "channel",
                    "profile_creation_date",
                    "default_search_engine",
                    "default_search_engine_data_load_path",
                    "default_search_engine_data_submission_url",
                    "sample_id",
                ],
            )
        )
        + [
            # Count of 'first' subsessions seen for this client_day
            count(when(col("subsession_counter") == 1, 1)).alias(
                "sessions_started_on_this_day"
            ),
            first(
                datediff(
                    "subsession_start_date",
                    from_unixtime(col("profile_creation_date") * 24 * 60 * 60),
                )
            ).alias("profile_age_in_days"),
            sum(col("subsession_length") / 3600.0).alias("subsession_hours_sum"),
            mean(size("active_addons")).alias("active_addons_count_mean"),
            max("scalar_parent_browser_engagement_max_concurrent_tab_count").alias(
                "max_concurrent_tab_count_max"
            ),
            sum("scalar_parent_browser_engagement_tab_open_event_count").alias(
                "tab_open_event_count_sum"
            ),
            sum(col("active_ticks") * 5 / 3600.0).alias("active_hours_sum"),
        ],
    )

def add_duration_column(df):
    """
    Calculate the duration between arrival date and departure date and store
    it in a column.

    Parameters
    ----------
    df: pyspark.sql.dataframe.DataFrame
    """
    logger.info('adding duration column')
    # arrdate/depdate are SAS dates, i.e. days since 1960-01-01.
    df = df.withColumn("arrival_dt", F.expr("date_add(to_date('1960-01-01'), arrdate)"))
    df = df.withColumn("depart_dt", F.expr("date_add(to_date('1960-01-01'), depdate)"))
    return df.withColumn("length_stay", F.datediff("depart_dt", "arrival_dt"))

def weigthed_size(min_date, max_date, begin, end):
    """
    A vectorized approach to calculate the weight of a size in a given period.

    @param min_date column with the first day of the row's own interval (yyyyMMdd string).
    @param max_date column with the last day of the row's own interval (yyyyMMdd string).
    @param begin first day of the period (as lit column).
    @param end last day of the period (as lit column).
    """
    _start = when(min_date >= begin, min_date).otherwise(begin)
    _end = when((max_date < end), max_date).otherwise(end)
    delta = datediff(
        from_unixtime(unix_timestamp(_end, "yyyyMMdd")),
        from_unixtime(unix_timestamp(_start, "yyyyMMdd")),
    ) + lit(1)
    delta = when((max_date < begin) | (min_date > end), lit(0)).otherwise(delta)
    period = datediff(
        from_unixtime(unix_timestamp(end, "yyyyMMdd")),
        from_unixtime(unix_timestamp(begin, "yyyyMMdd")),
    ) + lit(1)
    x = delta.cast(DoubleType()) / period.cast(DoubleType())
    return x

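# Worked example for weigthed_size: for a period 20200101..20200110 (10 days)
# and a row active 20200105..20200107, delta = 3 and the weight is
# 3 / 10 = 0.3; rows entirely outside the period get weight 0.
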
def get_lot_purchases(timeframe, partner, purchase):
    """
    Returns the distribution of length of trip for farekeep sold by a partner
    over a time period ending today.

    input: a period (string), a partner (string), a spark dataframe
    output: a dictionary with the bucket and the values associated
    """
    result = {}
    timeframe_is = get_date(timeframe)
    new_df = (purchase.filter(purchase.keen.timestamp >= timeframe_is)
              .filter(purchase.search_info.partner_id == partner)
              .filter(purchase.flight.inbound.departure_datetime != 'NaN')
              .withColumn("length_of_trip",
                          datediff(purchase.flight.inbound.departure_datetime,
                                   purchase.flight.outbound.departure_datetime)))
    lots = new_df.groupBy('length_of_trip').sum('purchase.quantity').collect()
    for row in lots:
        result[row[0]] = row[1]
    return result

def get_lot_searches(timeframe, partner, premium):
    """
    Returns the distribution of length of trip for farekeep searched by a
    partner over a time period ending today.

    input: a period (string), a partner (string), a spark dataframe
    output: a dictionary with the bucket and the values associated
    """
    result = {}
    timeframe_is = get_date(timeframe)
    # datediff in Spark works with date strings directly
    new_df = (premium.filter(premium.keen.timestamp >= timeframe_is)
              .filter(premium.search_info.partner_id == partner)
              .filter(premium.flight.is_one_way == False)
              .withColumn("length_of_trip",
                          datediff(premium.flight.return_date,
                                   premium.flight.departure_date)))
    lots = new_df.groupBy('length_of_trip').sum('flight.ticket_quantity').collect()
    for row in lots:
        result[row[0]] = row[1]
    return result

def featurizeData(raw, gap, vocabFile, featFile):
    feats = raw.dropDuplicates(['cluster', 'series', 'date']) \
        .withColumn('day', datediff(col('date'), lit('1970-01-01'))) \
        .na.drop(subset=['day']) \
        .rdd.groupBy(lambda r: r.cluster) \
        .flatMap(lambda c: clusterFeatures(c, gap)) \
        .toDF()
    feats.cache()
    cv = CountVectorizer(inputCol='raw', outputCol='features', minDF=4.0)
    interner = cv.fit(feats)
    # alternate possibility: grab features only from label==1 edges
    full = interner.transform(feats)
    # combiner = VectorAssembler(inputCols=realCols + ['categorial'], outputCol='features')
    # # I don't think a Pipeline will work here since we need to get the interner.vocabulary
    # full = combiner.transform(interner.transform(feats)).drop('categorial')
    full.write.parquet(featFile)
    np.savetxt(vocabFile, np.array(interner.vocabulary), fmt='%s')
    feats.unpersist()

.withColumn("today", current_date())\ .withColumn("now", current_timestamp()) dateDF.createOrReplaceTempView("dateTable") # COMMAND ---------- from pyspark.sql.functions import date_add, date_sub dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1) # COMMAND ---------- from pyspark.sql.functions import datediff, months_between, to_date dateDF.withColumn("week_ago", date_sub(col("today"), 7))\ .select(datediff(col("week_ago"), col("today"))).show(1) dateDF.select( to_date(lit("2016-01-01")).alias("start"), to_date(lit("2017-05-22")).alias("end"))\ .select(months_between(col("start"), col("end"))).show(1) # COMMAND ---------- from pyspark.sql.functions import to_date, lit spark.range(5).withColumn("date", lit("2017-01-01"))\ .select(to_date(col("date"))).show(1) # COMMAND ----------