def week_diff(end, start):
    """Return the number of weeks separating two date columns.

    Each date is first truncated to the start of its week;
    ``date_trunc("week", date)`` yields the Monday of the week the date
    belongs to. The day difference between the two week starts is then
    divided by seven.

    :param end: name of the column the difference is measured up to
    :param start: name of the column the difference is measured from
    :return: Column expression ``datediff(week(end), week(start)) / 7``
    """
    end_week_start = date_trunc("week", col(end))
    start_week_start = date_trunc("week", col(start))
    return datediff(end_week_start, start_week_start) / 7
def _avg(self, data, avg_over):
    """Average every feature column of ``data`` over a time period.

    Args:
        data (obj): Spark DataFrame with a "datetime" column; every other
            column is treated as a feature and averaged.
        avg_over (str): periodicity specification, handed to
            ``F.date_trunc`` as the truncation unit.

    Returns:
        Averaged Spark DataFrame, ordered by the truncated "datetime".
    """
    feature_names = list(data.columns)
    feature_names.remove("datetime")

    period = F.date_trunc(avg_over, data["datetime"])
    averages = [F.avg(name) for name in feature_names]
    averaged = (
        data.withColumn(avg_over, period)
        .groupBy(avg_over)
        .agg(*averages)
        .orderBy(avg_over)
    )

    # The aggregate columns come back named "avg(<feature>)"; restore the
    # plain feature names.
    for name in feature_names:
        averaged = averaged.withColumnRenamed("avg({})".format(name), name)

    return averaged.withColumnRenamed(avg_over, "datetime")
def make_set(self, setname, align_dates, feats):
    """Build a new Spark DataFrame from features picked across the stored data.

    Every selected feature has its "datetime" truncated to the same unit so
    that features coming from different data are time consistent with one
    another (i.e. everything daily).

    Args:
        setname (str): Name of the new dataset; the result is stored in
            ``self.sets[setname]``.
        align_dates (str): Truncation unit handed to ``F.date_trunc``.
            NOTE(review): earlier documentation listed "days", "weeks",
            "months" -- confirm these spellings are accepted by date_trunc.
        feats (dict): Dictionary of list of features and the data it
            originates from: {data: [feat1, feat2, feat3]}
    """
    aligned = {}
    for source_name, feature_names in feats.items():
        for feature_name in feature_names:
            feature_df = self.dicts[source_name][feature_name]
            truncated = F.date_trunc(align_dates, feature_df.datetime)
            aligned[feature_name] = feature_df.select(
                truncated.alias("datetime"), "value")

    self.sets[setname] = self._dict2df(aligned, sort=True, drop_nulls=False)

    # Truncation can map several samples onto the same datetime, so keep
    # only one row per datetime.
    self.sets[setname] = self.sets[setname].dropDuplicates(["datetime"])
def aggregateSalesRevenueDeltaBacked(updatesDF, epochId):
    """Merge a batch of sales into the ``sw_db.delta_backed_state`` Delta table.

    Incoming rows are summed per (minute, item_id) and then merged into the
    target table: matching rows get the new sales added to the stored sales,
    non-matching rows are inserted.

    Args:
        updatesDF: DataFrame with "timestamp", "item_id" and "sales" columns.
        epochId: not used by this function.
            NOTE(review): the signature looks like a foreachBatch callback --
            confirm against the caller.
    """
    # Sum up the new incoming keys.
    perMinuteDF = updatesDF.withColumn(
        "timestamp", f.date_trunc("minute", "timestamp"))
    incomingSalesAggregateDF = perMinuteDF.groupBy(
        f.col("timestamp"), f.col("item_id")
    ).agg(f.sum("sales").alias("sales"))

    targetTable = DeltaTable.forName(spark, "sw_db.delta_backed_state")

    # We merge the new sales with the already existing sales.
    # A watermark is simulated by only matching timestamp records greater
    # than (max seen timestamp - 5 minutes).
    # Note that it is even better to partition the state by date if you have
    # days worth of data, to skip over entire partitions when pushing down
    # the predicate.
    maxRow = targetTable.toDF().select(
        f.max("timestamp").alias("max_timestamp")).head()
    mostRecentTimestamp = maxRow.max_timestamp
    if mostRecentTimestamp:
        watermarkTime = mostRecentTimestamp - timedelta(minutes=5)
    else:
        # Empty state table: no lower bound.
        watermarkTime = datetime.min

    mergeCondition = f"""
        target.item_id = source.item_id AND
        target.timestamp = source.timestamp AND
        target.timestamp > cast('{watermarkTime}' AS TIMESTAMP) AND
        source.timestamp > cast('{watermarkTime}' AS TIMESTAMP)
        """
    summedSales = {"sales": f.col("source.sales") + f.col("target.sales")}

    mergeBuilder = targetTable.alias("target").merge(
        incomingSalesAggregateDF.alias("source"), mergeCondition)
    mergeBuilder.whenMatchedUpdate(
        set=summedSales).whenNotMatchedInsertAll().execute()
def get_daily_avg_v(self):
    """Return the per-day average of the 'Volume BTC' column of ``self.data``.

    Rows are bucketed by ``date_trunc('dd', timestamp)``, averaged, ordered
    by date, collected to the driver and passed through ``get_label_value``.
    """
    source = self.data
    day = functions.date_trunc('dd', source['timestamp'])
    per_day = source.withColumn('date', day).groupby('date').avg('Volume BTC')
    ordered = per_day.select("date", "avg(Volume BTC)").orderBy('date')
    ordered.cache()
    rows = ordered.collect()
    return get_label_value(rows)
def save_comment_per_month(session, comments, base_name):
    """Count comments per calendar month and write the counts as CSV.

    The output path is ``reddit/<base_name>-<application id>.csv``.

    Args:
        session: Spark session; only its application id is used.
        comments: DataFrame with a ``created`` column.
        base_name: prefix of the output file name.
    """
    month = date_trunc("month", comments.created).alias("month")
    counts_per_month = comments.select(month).groupby("month").count()
    writer = counts_per_month.write
    writer.csv(
        "reddit/%s-%s.csv" % (base_name, session.sparkContext.applicationId)
    )
def county_reality_supply():
    """Sum the allocated quantity per county, tier and item and write it to HBase.

    Reads the customer/item allocation table, keeps the rows whose
    begin/end dates fall inside the date window computed below, joins
    customer, item, city and county lookups, and hands each partition of the
    result to ``write_hbase1``. Any exception is printed and swallowed.

    NOTE(review): the log message and the original comments speak of
    "last week", but the active query covers the four full weeks before the
    current week (``7 * 4`` days back from this week's Monday, spanning
    ``6 + 7 * 3`` further days) -- confirm which is intended.
    """
    # Supply quantity of this item spec for each tier in each county, last week
    try:
        print(f"{str(dt.now())} 各区县各档位该品规上周投放量")
        plm_item = get_plm_item(spark).select("item_id", "item_name")
        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id", "cust_seg")
        area = get_area(spark)
        # Mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # Mapping from sale_center_id to its counties (list)
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")
        # Fetch last week's actual supply quantity.
        # Previous single-week variant, kept for reference:
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")
        # Active variant: window starts 28 days before this week's start and
        # ends 27 days later.
        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\
            .join(co_cust,"cust_id")
        # Column name of the value being computed
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \
            .agg(f.sum("qty_allocco").alias(colName))
        # NOTE(review): "city" is listed twice below -- confirm whether
        # write_hbase1 relies on that.
        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "city", "gears_data_marker", colName
        ]
        # "row" is built as <sale_center_id>_<cust_seg>_<item_id>; presumably
        # it is the HBase row key -- verify in write_hbase1.
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county,"sale_center_id")\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .withColumnRenamed("cust_seg","gears")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        # Failures are only printed; nothing is re-raised.
        tb.print_exc()
def process_log_data(spark, input_data, output_data):
    """Build the user, songplays and time tables from the log data.

    Loads the log and song JSON data from ``input_data``, derives the three
    tables and writes each of them back under ``output_data`` in parquet
    format.

    Args:
        spark: active Spark session.
        input_data: path prefix of the input data (ends with a separator).
        output_data: path prefix the parquet output is written under.
    """
    # read log data file
    df = spark.read.json(input_data + 'log_data/2018/11/*.json')
    df.persist()

    # filter by actions for song plays
    # (was `df_event.page`, a name that is never defined in this function)
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    user_table = df.dropDuplicates(['userId']) \
        .select(['userId', 'firstName', 'lastName', 'gender', 'level'])

    # write users table to parquet files
    user_table.write.parquet(output_data + 'udacity/user')

    # create start_time column from original timestamp column
    # (`ts` is divided by 1000 before the cast to a timestamp)
    df = df.withColumn(
        'start_time', (df['ts'] / 1000).cast(types.TimestampType()))

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/A/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    join_condition = [
        song_df.artist_name == df.artist,
        song_df.duration == df.length,
        song_df.title == df.song,
    ]
    songplays_table = df.join(song_df, join_condition, 'inner') \
        .select(['start_time', 'userId', 'level', 'song_id', 'artist_id',
                 'sessionId', 'location', 'userAgent'])

    # Adding year and month for partitioning the dataframe
    songplays_table = songplays_table \
        .withColumn('month', F.month('start_time')) \
        .withColumn('year', F.year('start_time'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        output_data + 'udacity/songplay')

    # create time column list
    time_column_list = [
        'start_time',
        F.hour('start_time').alias('hour'),
        F.dayofmonth('start_time').alias('day'),
        F.date_trunc('week', 'start_time').alias('week'),
        F.month('start_time').alias('month'),
        F.year('start_time').alias('year'),
        F.dayofweek('start_time').alias('weekday'),
    ]

    # extract columns to create time table
    # (was `*time_columns`, which is undefined; the list above is meant)
    time_table = songplays_table.select(*time_column_list)

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        output_data + 'udacity/time')
def compile_date_truncate(t, expr, scope, **kwargs):
    """Compile a timestamp-truncate expression to ``F.date_trunc``.

    The operation's unit is looked up in ``_time_unit_mapping``; a unit
    missing from the mapping raises ``com.UnsupportedOperationError``.
    The operation's argument is translated with ``t.translate``.
    """
    operation = expr.op()

    try:
        spark_unit = _time_unit_mapping[operation.unit]
    except KeyError:
        message = '{!r} unit is not supported in timestamp truncate'.format(
            operation.unit)
        raise com.UnsupportedOperationError(message)

    column = t.translate(operation.arg, scope)
    return F.date_trunc(spark_unit, column)
def comment_spoilers_per_subreddit(session, spoiler_comments, month=None):
    """Count spoiler comments per subreddit, write them as CSV and return them.

    Args:
        session: Spark session; only its application id is used, for the
            output path.
        spoiler_comments: DataFrame with ``created`` and ``subreddit`` columns.
        month: when truthy, only comments whose ``created`` value truncated
            to the month equals this value are counted.

    Returns:
        DataFrame of (subreddit, count), sorted by descending count.
    """
    if month:
        month_start = date_trunc("month", spoiler_comments.created)
        with_month = spoiler_comments.withColumn("month", month_start)
        spoiler_comments = with_month.filter(with_month.month == month)

    per_subreddit = spoiler_comments.groupby("subreddit").count()
    ranked = per_subreddit.sort(desc("count"))

    ranked.write.csv(
        "reddit/spoilers_per_subreddit-%s.csv" % session.sparkContext.applicationId
    )
    return ranked
def predict():
    """Tag the streamed messages and average the predictions per minute and hour.

    Reads ``STREAMED_FILENAME``, runs the pipeline stored at ``MODEL_PATH``
    over the cleaned text, and averages the "prediction" column per minute
    and per hour of the message timestamp.

    Returns:
        tuple: ``(resultMinutes, resultHours, messages)`` -- three pandas
        DataFrames. The first two have the columns
        ``['timestamp', 'prediction']``; ``messages`` holds the input rows
        with ``timestamp`` converted by ``pd.to_datetime``.
    """
    test = spark.read.csv(STREAMED_FILENAME, header=True, mode="DROPMALFORMED")
    test = test.select(F.col("timestamp"), cleanText(F.col("text")))
    messages = test.toPandas()

    toxicTagger = PipelineModel.load(MODEL_PATH)

    # Keep "timestamp" on the frame while it goes through the pipeline
    # instead of splitting it off and re-joining on two independently
    # generated monotonically_increasing_id() columns: those ids are only
    # guaranteed to be increasing and unique, not to line up between two
    # DataFrames, so the join could pair a prediction with the wrong time.
    tagged = toxicTagger.transform(test).select(
        F.col("timestamp"), F.col("prediction"))

    tagged = tagged.withColumn(
        "datetime",
        F.from_unixtime(F.unix_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss")))
    tagged = tagged.select(F.col("datetime"), F.col("prediction"))

    def _mean_prediction_per(unit):
        # Average "prediction" over buckets of `unit` and return it as pandas.
        bucketed = tagged.withColumn(
            "timestamp",
            F.date_trunc(unit, F.col("datetime").cast("timestamp")))
        bucketed = bucketed.select(F.col("timestamp"), F.col("prediction"))
        averaged = bucketed.groupBy("timestamp").mean("prediction").sort(
            F.col("timestamp").asc())
        averaged = averaged.na.drop(subset=["timestamp", "avg(prediction)"])
        result = averaged.toPandas()
        result['timestamp'] = pd.to_datetime(result.timestamp)
        result.columns = ['timestamp', 'prediction']
        return result

    resultMinutes = _mean_prediction_per("minute")
    resultHours = _mean_prediction_per("hour")

    messages['timestamp'] = pd.to_datetime(messages.timestamp)
    return resultMinutes, resultHours, messages
def to_friday(df, dt):
    '''Map every date to the Friday of its week, keeping one row per week.

    Note that this is not a forward mapping: Saturday and Sunday are mapped
    backward. This is fine since all the data are supposed to be in
    "business days". To be consistent over the weekends, use the timestamps.

    ``dt`` is the name of the date column. The returned frame has the same
    columns as the input, with ``dt`` replaced by the Friday and only the
    record with the latest original ``dt`` kept for each week.
    '''
    original_columns = df.columns

    # Week start plus four days is the Friday of that week.
    week_friday = F.date_add(F.date_trunc('week', dt), 4)
    with_friday = df.withColumn('friday', week_friday)

    # Rank the rows of each week from latest to earliest and keep the latest.
    latest_first = Window.partitionBy('friday').orderBy(F.col(dt).desc())
    ranked = with_friday.withColumn('rn', F.row_number().over(latest_first))
    last_of_week = ranked.where(F.col('rn') == 1)

    relabelled = last_of_week.drop(dt).withColumnRenamed('friday', dt)
    return relabelled.select(*original_columns)
def create_vars(df, cells):
    """Parse the call time and add the derived analysis columns.

    ``call_datetime`` is parsed with the pattern ``dd/MM/yyyy HH:mm:ss`` and
    ``call_date`` is derived from it. The frame is then left-joined to
    ``cells`` on ``location_id == cell_id`` and extended with lag/lead
    columns over ``user_window``, hour/day/week/month buckets and a constant
    column. Missing ``region``, ``region_lag`` and ``region_lead`` values are
    filled with ``missing_value_code``.
    """
    # Loading variables
    df = df.withColumn(
        "call_datetime",
        to_timestamp("call_datetime", "dd/MM/yyyy HH:mm:ss"))

    # get call_date from call_datetime
    df = df.withColumn('call_date', df.call_datetime.cast('date'))

    # Recreate analysis variables
    located = df.join(
        cells, df.location_id == cells.cell_id, how='left').drop('cell_id')
    located = located.orderBy('msisdn', 'call_datetime')

    call_time = F.col('call_datetime')
    derived_columns = [
        ('region_lag', F.lag('region').over(user_window)),
        ('region_lead', F.lead('region').over(user_window)),
        ('call_datetime_lag', F.lag('call_datetime').over(user_window)),
        ('call_datetime_lead', F.lead('call_datetime').over(user_window)),
        ('hour_of_day', F.hour('call_datetime').cast('byte')),
        ('hour', F.date_trunc('hour', call_time)),
        ('week', F.date_trunc('week', call_time)),
        ('month', F.date_trunc('month', call_time)),
        ('constant', F.lit(1).cast('byte')),
        ('day', F.date_trunc('day', call_time)),
    ]
    for column_name, expression in derived_columns:
        located = located.withColumn(column_name, expression)

    region_defaults = {
        'region': missing_value_code,
        'region_lag': missing_value_code,
        'region_lead': missing_value_code,
    }
    return located.na.fill(region_defaults)
def _avg(self, data, avg_over):
    """Average all non-"datetime" columns of ``data`` per ``avg_over`` period.

    ``avg_over`` is used as the ``F.date_trunc`` unit. The result is ordered
    by period, keeps the original feature names, and exposes the period
    under the name "datetime".
    """
    value_columns = list(data.columns)
    value_columns.remove("datetime")

    bucketed = data.withColumn(
        avg_over, F.date_trunc(avg_over, data["datetime"]))
    aggregates = [F.avg(column) for column in value_columns]
    result = bucketed.groupBy(avg_over).agg(*aggregates).orderBy(avg_over)

    # Aggregated columns are named "avg(<column>)"; rename them back.
    for column in value_columns:
        aggregated_name = "avg({})".format(column)
        result = result.withColumnRenamed(aggregated_name, column)

    result = result.withColumnRenamed(avg_over, "datetime")
    return result
def execute(spark: SparkSession, log: logging, config: dict):
    """Load call records into ``task_02`` and upsert hourly metrics to Postgres.

    Steps:
      1. Read the ';'-separated CSV at the path derived from
         ``config['params']``, parse ``START_TIME`` and add an hourly ``TS``
         partition key (``yyyy-MM-dd-HH``).
      2. Append the rows to the Hive table ``task_02`` partitioned by ``TS``.
      3. Re-read the rows with ``ts_from <= TS < ts_to`` and compute: number
         of calls, total call duration, number of calls between 8 and 17
         o'clock, the IMEI with the most calls, and the two locations with
         the most calls.
      4. Upsert the metrics into ``metric_hour``, keyed on ``ts``.

    Args:
        spark: active Spark session.
        log: logger used for progress messages.
        config: dict with the keys ``params`` (``ts``, ``in_path``),
            ``postgres``, ``ts_from`` and ``ts_to``.
    """
    log.info("extract")
    params = config['params']
    ps_conf = config['postgres']
    ts: datetime.datetime = params['ts']
    in_path = ts.strftime(params['in_path'])
    ts_from = config['ts_from']
    ts_to = config['ts_to']

    df = spark.read.csv(in_path, header=True, sep=';')

    # DataFrames are immutable: the projected frame has to be assigned.
    # Previously the result was discarded, so the frame written below had
    # no "TS" column to partition by.
    df = df.select(
        F.col('FROM_PHONE_NUMBER'),
        F.col('TO_PHONE_NUMBER'),
        F.to_timestamp(df['START_TIME'], 'dd/MM/yyyy HH:mm:ss').alias('START_TIME'),
        F.col('CALL_DURATION').cast('long'),
        F.col('IMEI'),
        F.col('LOCATION')
    ).withColumn(
        "TS",
        F.date_format(F.date_trunc("hour", "START_TIME"), "yyyy-MM-dd-HH"))

    df.write.partitionBy("TS").mode('append').format('hive').saveAsTable('task_02')

    # NOTE(review): ts_from / ts_to are interpolated without quoting; if they
    # are plain strings such as 2020-01-01-00 the SQL will not compare them
    # as strings -- confirm what the caller passes.
    df = spark.sql(
        "select * from task_02 where TS >= {} AND TS < {}".format(ts_from, ts_to)
    ).drop_duplicates()
    df.cache()

    ts = df.select("TS").rdd.map(lambda x: x[0]).first()

    # Number of call, total call duration.
    num_call = df.count()
    total_call_duration = df.select(F.sum(df['CALL_DURATION'])).first()[0]

    # Number of call in working hour (8am to 5pm)
    num_call_working_hour = df.filter(
        "hour(START_TIME) >= 8 AND hour(START_TIME) <= 17").count()

    # Find the IMEI which make most call.
    imei_most = df.groupBy('IMEI').count().sort(
        F.col("count").desc()).first().asDict()

    # Find top 2 locations which make most call.
    top_locations = df.groupBy('LOCATION').count().sort(
        F.col("count").desc()).head(2)
    locations = [row.asDict() for row in top_locations]

    rs = (ts, num_call, total_call_duration, num_call_working_hour,
          imei_most, locations)

    with get_postgres_cli(ps_conf) as ps_cli:
        with ps_cli.cursor() as cur:
            # A comma was missing between EXCLUDED.num_call_working_hour and
            # EXCLUDED.imei_most, which made the statement invalid SQL.
            sql = """
            INSERT INTO metric_hour(
                ts, num_call, total_call_duration,
                num_call_working_hour, imei_most, locations
            ) VALUES(%s, %s, %s, %s, %s, %s)
            ON CONFLICT (ts)
            DO UPDATE SET(
                num_call, total_call_duration,
                num_call_working_hour, imei_most, locations)
                = (EXCLUDED.num_call, EXCLUDED.total_call_duration,
                   EXCLUDED.num_call_working_hour,
                   EXCLUDED.imei_most, EXCLUDED.locations)
            """
            cur.execute(sql, rs)
def make_set(self, setname, align_dates, feats):
    """Assemble ``self.sets[setname]`` from features of the stored data.

    ``feats`` maps a data name to the list of features to take from it:
    {data: [feat]}. Each feature's "datetime" is truncated to
    ``align_dates`` (an ``F.date_trunc`` unit) before the features are
    combined with ``self._dict2df``.
    """
    picked = {}
    for data_name in feats:
        for feature in feats[data_name]:
            frame = self.dicts[data_name][feature]
            aligned_time = F.date_trunc(align_dates, frame.datetime)
            picked[feature] = frame.select(
                aligned_time.alias("datetime"), "value")

    self.sets[setname] = self._dict2df(picked, sort=True, drop_nulls=False)

    # Truncated duplicates can exist in the dataframe, so drop these samples.
    deduplicated = self.sets[setname].dropDuplicates(["datetime"])
    self.sets[setname] = deduplicated
def execute(spark: SparkSession, log: logging, config: dict):
    """Compute the actual activation date per phone number and store it.

    Reads the CSV at ``config['params']['in_path']``, replaces missing
    ``DEACTIVATION_DATE`` values with '9999-12-31', collects the activation
    and deactivation dates of every ``PHONE_NUMBER`` and feeds them to
    ``udf_actual_active_date``. The result is written as parquet to
    ``config['params']['out_path']``, partitioned by the ``yyyy-MM`` month
    of the computed date, overwriting existing output.
    """
    log.info("extract")
    params = config['params']
    in_path = params['in_path']
    out_path = params['out_path']

    raw = spark.read.csv(in_path, header=True)
    df = raw.repartition(120, "PHONE_NUMBER").na.fill(
        {'DEACTIVATION_DATE': '9999-12-31'})

    log.info("transform")
    per_number = df.sort(df.DEACTIVATION_DATE.desc()).groupby(
        ['PHONE_NUMBER']
    ).agg(
        F.collect_list(df['ACTIVATION_DATE']).alias('ACTIVATION_DATE'),
        F.collect_list(df['DEACTIVATION_DATE']).alias('DEACTIVATION_DATE')
    )
    actual_active_date = udf_actual_active_date(
        F.col('ACTIVATION_DATE'), F.col('DEACTIVATION_DATE'))
    with_active_date = per_number.withColumn(
        'ACTUAL_ACTIVE_DATE', actual_active_date
    ).select(['PHONE_NUMBER', 'ACTUAL_ACTIVE_DATE'])

    month_key = F.date_format(
        F.date_trunc("month", "ACTUAL_ACTIVE_DATE"), "yyyy-MM")
    df_norm = with_active_date.withColumn("TS", month_key)

    log.info("load")
    df_norm.write.partitionBy("TS").parquet(out_path, mode="overwrite")
    # The output is read back; the resulting frame is not used.
    spark.read.parquet(out_path)
def persist_weather(wthr):
    """
    Create and cache dataframe of weather data.

    Only reports of type 'FM-15' from station '72534014819' are kept.
    Round hourly temp to nearest 10 deg F (``trnd``).
    Divide rainfall into none, light (<=0.2 in), and heavy bins (``prnd``
    is 0, 0.2 or 1 respectively).
    Truncate the timestamp to the hour (``timernd``).
    ``day`` is the 'u' date-format field cast to int; ``hour`` is the hour
    of the report.
    """
    station_reports = wthr \
        .select('date', 'tdry', 'precip') \
        .filter(wthr.station == '72534014819') \
        .filter(wthr.report == 'FM-15') \
        .fillna({'precip': 0})

    rounded_temp = sf.round(station_reports.tdry / 10) * 10
    rain_bin = sf.when(station_reports.precip == 0, 0) \
        .when(station_reports.precip.between(0, 0.2), 0.2) \
        .otherwise(1)
    hour_start = sf.date_trunc("Hour", station_reports.date)
    day_number = sf.date_format(station_reports.date, 'u').cast('int')

    prepared = station_reports \
        .withColumn('trnd', rounded_temp) \
        .withColumn('prnd', rain_bin) \
        .withColumn('timernd', hour_start) \
        .withColumn('day', day_number) \
        .withColumn('hour', sf.hour(station_reports.date)) \
        .drop('tdry', 'precip', 'date')

    return prepared.cache()
def persist_cabs(cabs):
    """Filter, calculate columns, partition on start time and cache cab df.

    Trips with ``trip_tot`` of 500 or more are dropped and missing numeric
    values are replaced by 0. The start time is parsed and truncated to the
    hour (``startrnd``), ``total`` is fare + tip + extra, and two capped
    rates are derived: ``permile`` (capped at 20, default 4 when the
    distance is not above 0.2) and ``permin`` (capped at 5, default 1 when
    the duration is not above 1).
    """
    kept_columns = ['taxi', 'start_str', 'comm_pick', 'dur', 'dist',
                    'fare', 'tip', 'extra']
    trips = cabs \
        .filter(cabs.trip_tot < 500) \
        .select(kept_columns) \
        .fillna(0, subset=['fare', 'tip', 'extra', 'dur', 'dist'])

    start_time = sf.to_timestamp(trips.start_str, 'MM/dd/yyyy hh:mm:ss aa')
    priced = trips \
        .withColumn('startrnd', sf.date_trunc("Hour", start_time)) \
        .withColumn('total', trips.fare + trips.tip + trips.extra) \
        .drop('start_str', 'fare', 'tip', 'extra')

    per_mile = sf.when(
        priced.dist > 0.2,
        sf.least(priced.total / priced.dist, sf.lit(20))
    ).otherwise(sf.lit(4))
    per_minute = sf.when(
        priced.dur > 1,
        sf.least(priced.total / (priced.dur/60), sf.lit(5))
    ).otherwise(sf.lit(1))
    rated = priced \
        .withColumn('permile', per_mile) \
        .withColumn('permin', per_minute) \
        .drop('dur', 'dist')

    partitioned = rated.repartition(200, 'startrnd')
    return partitioned.persist(StorageLevel.MEMORY_AND_DISK_SER)
def comment_spoilers_per_month_and_sub(session, spoiler_comments, limit_top_n=5):
    """Write monthly spoiler counts for the top subreddits as text files.

    Comments are counted per (month, subreddit). For every month the
    ``limit_top_n`` subreddits with the highest counts are taken (all of
    them when ``limit_top_n`` is None); the distinct union of those forms
    the column set. One header file and one data file are written, the data
    rows being the month followed by whatever ``subreddit_counts`` returns
    for that month.
    """
    month = date_trunc("month", spoiler_comments.created).alias("month")
    monthly_counts = spoiler_comments \
        .select(month, "subreddit") \
        .groupby("month", "subreddit") \
        .count()
    spoiler_counts = monthly_counts.rdd \
        .groupBy(lambda row: row["month"]) \
        .cache()

    def by_count_descending(rows):
        return sorted(rows, key=lambda row: row["count"], reverse=True)

    def leading_rows(pair):
        ranked = pair[1]
        if limit_top_n is not None:
            return ranked[:limit_top_n]
        return ranked

    def subreddit_names(rows):
        return [row["subreddit"] for row in rows]

    top_subs = spoiler_counts \
        .mapValues(by_count_descending) \
        .map(leading_rows) \
        .flatMap(subreddit_names) \
        .distinct() \
        .collect()

    top_sub_counts = spoiler_counts.map(
        lambda pair: [pair[0]] + subreddit_counts(top_subs, pair[1]))

    application_id = session.sparkContext.applicationId

    header = session.sparkContext.parallelize(
        [["month"] + top_subs + ["others"]])
    header.map(lambda row: ",".join(row)).saveAsTextFile(
        "reddit/spoilers_per_month_and_sub_header-%s.csv" % application_id
    )

    top_sub_counts.map(
        lambda row: ",".join(str(el) for el in row)
    ).saveAsTextFile(
        "reddit/spoilers_per_month_and_sub-%s.csv" % application_id
    )

    spoiler_counts.unpersist()
def process_history_df_mom(self, df):
    """Compute the month-over-month growth of repository create events.

    Keeps the events whose payload marks a repository and whose type is
    'CreateEvent' or 'Event', counts them per calendar month of
    ``created_at``, and compares every month with the month before it.

    Returns:
        DataFrame with the columns ``month_created_at_1`` (month),
        ``count_1`` (number of create events in that month) and
        ``monthly_increase_rate`` ((count - previous count) / previous
        count). A month without a preceding month in the data uses its own
        count as the previous count, so its rate is 0.
    """
    # There are two versions of API for CreateEvent of repository:
    #   - One is col("payload")['object'] == 'repository'
    #   - Another is col("payload")['ref_type'] == 'repository'
    # Both are accepted here.
    num_create_events_df = \
        df \
        .filter((col('payload')['ref_type'] == 'repository') |
                (col('payload')['object'] == 'repository')) \
        .filter((col('type') == 'CreateEvent') | (col('type') == 'Event'))

    # count the number of create events that happened in one month
    # (group by month)
    num_create_events_by_month_df = num_create_events_df.groupby(
        date_trunc('month', df.created_at).alias('month_created_at')).count()

    # Calculate the growth rate of each month compared to the previous month:
    # duplicate the monthly counts into two dataframes and, for each month in
    # the first one, look up the count of the month before it in the second.
    # [df1] 2015-02-01 -> [df2] 2015-01-01 (one month earlier)
    num_create_events_by_month_df_1 = num_create_events_by_month_df.alias(
        'num_create_events_by_month_df_1')
    num_create_events_by_month_df_1 = \
        num_create_events_by_month_df_1 \
        .select(
            col('month_created_at').alias('month_created_at_1'),
            col('count').alias('count_1'))

    num_create_events_by_month_df_2 = num_create_events_by_month_df.alias(
        'num_create_events_by_month_df_2')
    num_create_events_by_month_df_2 = \
        num_create_events_by_month_df_2 \
        .select(
            col('month_created_at').alias('month_created_at_2'),
            col('count').alias('count_2'))

    # 'last_week_month_created_at' holds the previous month despite its
    # name; the name is kept because it appears in the shown output.
    joined_num_create_events_df = \
        num_create_events_by_month_df_1 \
        .withColumn(
            'last_week_month_created_at',
            date_trunc(
                'month',
                add_months(
                    num_create_events_by_month_df_1.month_created_at_1, -1))) \
        .join(
            num_create_events_by_month_df_2,
            col('last_week_month_created_at') == col('month_created_at_2'),
            how='left_outer')
    joined_num_create_events_df.show()

    # No previous month in the data: fall back to the month's own count.
    joined_num_create_events_df = joined_num_create_events_df.withColumn(
        'count_2', coalesce('count_2', 'count_1'))

    num_create_events_with_growth_rate_df = \
        joined_num_create_events_df \
        .withColumn(
            'monthly_increase_rate',
            ((joined_num_create_events_df.count_1 -
              joined_num_create_events_df.count_2) /
             joined_num_create_events_df.count_2)
        ) \
        .select(
            'month_created_at_1',
            'count_1',
            'monthly_increase_rate')

    num_create_events_with_growth_rate_df.show()

    return num_create_events_with_growth_rate_df
def __call__(self, x: sql.Column):
    """Truncate ``x`` to ``self._frequency`` and cast it to ``self.dtype``."""
    return functions.date_trunc(self._frequency, x).astype(self.dtype)
F.round(((col("power_out") / 10085) * 100), 1).alias("Percent Total Time Power Out"), F.round(((col("power_out") / col("total_measurements")) * 100), 1).alias("Percent Monitored Time Power Out"), F.round(((col("power_ambiguous") / 10085) * 100), 1).alias("Percent Total Time Power Ambiguous"), F.round(((col("power_ambiguous") / col("total_measurements")) * 100), 1).alias("Percent Monitored Time Power Ambiguous")) total_summary.show(20) total_summary.repartition(1).write.format("com.databricks.spark.csv").mode( 'overwrite').option("header", "true").save(args.result + '/weekly_summary') daily_summary = pw_df_resampled.groupBy( "tx", F.date_trunc('day', "time").alias("date_day")).agg( F.avg("grid_voltage").alias("average_measured_voltage"), total_measurements_in_period.alias("total_measurements"), valid_voltage_condition.alias("valid_voltage"), under_voltage_condition.alias("under_voltage"), over_voltage_condition.alias("over_voltage"), measurements_with_one_sensor.alias("one_sensor"), measurements_with_two_sensors.alias("two_sensors"), measurements_with_three_sensors.alias("three_sensors"), measurements_with_zero_sensors.alias("zero_sensors"), power_out.alias("power_out"), power_on.alias("power_on"), power_ambiguous.alias("power_ambiguous")) daily_summary = daily_summary.select( "tx", "date_day", F.round(((col("total_measurements") / 1440) * 100),
# Collapse back to one row per outage_time: average restore_time, and take
# the first value of every other column (which should all be the same after
# the explode).
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.avg("restore_time").alias("restore_times_mean"),*exprs)
# Explode the restore_times array into one row per element so that the
# population standard deviation can be computed with a regular aggregate.
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_stddev", F.explode("restore_times"))
# Again take the first value of each remaining column (which should all be
# the same after the explode).
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("restore_times_stddev").alias("restore_times_stddev"),*exprs)
# Range of restore times = largest minus smallest array element.
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_range", F.array_max("restore_times") - F.array_min("restore_times"))
# To calculate SAIDI/SAIFI we need to know the sensor population:
# join the number of sensors reporting with our outage groupings (matched on
# calendar day), then we can calculate the relative SAIDI/SAIFI contribution
# of each outage.
pw_finalized_outages = pw_finalized_outages.join(pw_distinct_core_id, F.date_trunc("day", F.from_unixtime(pw_finalized_outages["outage_time"])) == F.date_trunc("day", pw_distinct_core_id["window_mid_point"]))
pw_finalized_outages = pw_finalized_outages.select("outage_time","restore_times_mean","cluster_size","sensors_reporting","outage_times","outage_times_range","outage_times_stddev","restore_times","restore_times_range","restore_times_stddev", "location")
pw_finalized_outages = pw_finalized_outages.withColumn("relative_cluster_size",col("cluster_size")/col("sensors_reporting"))
# Serialize the outage_times, restore_times and location columns to JSON
# strings before the CSV write below.
pw_finalized_with_string = pw_finalized_outages.withColumn("outage_times",F.to_json("outage_times"))
pw_finalized_with_string = pw_finalized_with_string.withColumn("restore_times",F.to_json("restore_times"))
pw_finalized_with_string = pw_finalized_with_string.withColumn("location",F.to_json("location"))
# Save the full outage list as a single CSV part with a header row.
pw_finalized_with_string.repartition(1).write.format("com.databricks.spark.csv").mode('overwrite').option("header", "true").save(args.result + '/full_outage_list')
# We need to zero fill for every date and cluster size not already present in the dataset
df.select(F.date_format('dt', 'MM/dd/yyyy').alias('date')).show() # COMMAND ---------- # you have the following dataframe. Truncate to get the first day of the year (alias "year"), as well as a second column which gets the first day of the month (alias "month"): df = spark.createDataFrame([('1997-02-28 05:02:11', )], ['t']) # Expected: # +-------------------+-------------------+ # | year| month| # +-------------------+-------------------+ # |1997-01-01 00:00:00|1997-02-01 00:00:00| # +-------------------+-------------------+ # Answer df.select( F.date_trunc('year', df.t).alias('year'), F.date_trunc('mon', df.t).alias('month')).show() # COMMAND ---------- # You have this Pandas dataframe with the column "time" containing the dates between start_date and end_date. # 1. Convert to Pyspark # 2. Create a colunm "transactiondate" by casting the column "time" to date import pandas as pd time = pd.date_range('2018-01-01', '2018-01-03', freq='D') dfp = pd.DataFrame(columns=['time']) dfp['time'] = time # Expected: # +-------------------+---------------+
datasource1_null = datasource1.where(col("i3pl_chng_time").isNull()) datasource1_null = datasource1_null.withColumn( 'i3pl_chng_time', F.col('i3pl_chng_time').cast(TimestampType())) # #datasource1_null.select(['i3pl_chng_time']).show(10) concatenated_data = datasource1_null.union(datasource1_notnull) #concatenated_data.select(['i3pl_chng_time','source']).show(5) concatenated_data = concatenated_data.withColumn( 'DAY_OF_SHIP_RCV', F.lit(F.date_trunc("day", concatenated_data['ACTUAL_RCPT_SHIP_DATE'])).cast( TimestampType())) #concatenated_data = concatenated_data.withColumn('row_status',F.lit('inserted')) #concatenated_data.select(['row_status','i3pl_chng_time']).show(5) #concatenated_data.dtypes datasource2 = DynamicFrame.fromDF(concatenated_data, glueContext, "datasource2") applymapping1 = ApplyMapping.apply( frame=datasource2, mappings=[ ("document_type", "string", "document_type", "string"),
"""Show the paid transaction amount (valor) per ISO / merchant, per month."""
from pyspark.sql import functions

from spark_config import spark

if __name__ == '__main__':
    # Sanitized transactions: ';'-separated CSV with a header row.
    df1 = spark.read.csv(path="../output/sanitize_transactions/",
                         header=True,
                         inferSchema=True,
                         sep=";")

    # Keep paid transactions only and sum the amount per
    # (month of creation, ISO, merchant).
    df2 = df1.where(df1.status == "paid").groupBy(
        functions.date_trunc("month", df1.created_at).alias("transaction_month"),
        df1.iso_id,
        df1.merchant_id).agg(
            functions.sum(df1.valor).cast("decimal(15,2)").alias("valor"))

    # Select from df2's own columns: merchant_id was previously taken from
    # df1, the parent frame, rather than from the frame being selected on.
    df3 = df2.select(df2.iso_id, df2.merchant_id, df2.transaction_month,
                     df2.valor).orderBy(df2.iso_id, df2.transaction_month,
                                        df2.valor)

    df3.show(truncate=False)
    df3.printSchema()
# now explode the outage lists so that every line is a sensor involved in that outage and regroup by transformer and feeder # then each outage maps to a number of sensors out under each transformer and feeder # This gives the relative SAIFI contribution of each transformer in each outage pw_outages_by_feeder = pw_finalized_outages.select("outage_time", "feeder_id") pw_outages_by_feeder = pw_outages_by_feeder.withColumn("feeder_id", F.explode("feeder_id")) pw_outages_by_feeder = pw_outages_by_feeder.withColumn("size", F.lit(1)) pw_outages_by_feeder = pw_outages_by_feeder.groupBy( "outage_time", "feeder_id").agg(F.sum("size").alias("cluster_size")) pw_outages_by_feeder = pw_outages_by_feeder.select( "outage_time", "cluster_size", col("feeder_id").alias("feeder_id_o")) pw_outages_by_feeder.show() pw_outages_by_feeder = pw_outages_by_feeder.join( pw_distinct_core_id_by_feeder, ((F.date_trunc("day", F.from_unixtime( pw_outages_by_feeder["outage_time"])) == F.date_trunc( "day", pw_distinct_core_id_by_feeder["window_mid_point"])) & (pw_outages_by_feeder["feeder_id_o"] == pw_distinct_core_id_by_feeder["feeder_id"]))) pw_outages_by_feeder = pw_outages_by_feeder.withColumn( "relative_cluster_size", col("cluster_size") / col("sensors_reporting")) pw_outages_by_feeder = pw_outages_by_feeder.select("outage_time", "relative_cluster_size", "feeder_id") pw_outages_by_tx = pw_finalized_outages.select("outage_time", "tx") pw_outages_by_tx = pw_outages_by_tx.withColumn("tx", F.explode("tx")) pw_outages_by_tx = pw_outages_by_tx.withColumn("size", F.lit(1)) pw_outages_by_tx = pw_outages_by_tx.groupBy("outage_time", "tx").agg(
def saveOnCassandra(rdd):
    """Show hourly connection counts per originating and responding host.

    The records in ``rdd`` are parsed as JSON into a DataFrame. When that
    DataFrame has a ``conn`` column, two aggregations are printed with
    ``show()``: connections per (``conn.id.orig_h``, hour) and connections
    per (``conn.id.resp_h``, hour).

    Despite the name, this function does not write to Cassandra; it only
    displays the aggregated frames.

    Args:
        rdd: RDD exposing ``isEmpty()`` and accepted by
            ``spark.read.json`` (presumably an RDD of JSON strings -
            confirm against the caller).

    Returns:
        None.
    """
    if rdd.isEmpty():
        return

    df = spark.read.json(rdd)
    if 'conn' not in df.columns:
        return

    # NOTE: per-protocol, per-service and per-flow aggregations existed here
    # only as disabled (string-quoted / commented-out) code and never ran;
    # they were removed. Reintroduce them via _show_hourly_counts if needed.
    _show_hourly_counts(df, "id.orig_h", "orig_h", "Erro no df de Orig_h")
    _show_hourly_counts(df, "id.resp_h", "resp_h", "Erro no df de resp_h")


def _show_hourly_counts(df, conn_field, alias, error_message):
    """Show the number of rows per (``conn.<conn_field>``, hour).

    Args:
        df: DataFrame with a ``conn`` struct holding ``ts`` and
            ``conn_field``.
        conn_field: Name of the field inside ``conn`` (it contains a dot,
            so it is back-quoted when selected).
        alias: Dot-free column name used for the field in the output.
        error_message: Text printed, followed by the exception, when the
            aggregation fails.
    """
    try:
        selected = df.select(["conn.ts", "conn.`{}`".format(conn_field)])
        renamed = selected.withColumnRenamed(conn_field, alias)
        # `ts` is converted with from_unixtime, then truncated to the hour.
        dated = renamed.withColumn("date", from_unixtime(renamed['ts']))
        hourly = dated.select(alias, date_trunc("Hour", "date").alias("hour"))
        hourly.groupBy([alias, "hour"]).count().show()
    except Exception as exc:
        # Best-effort diagnostics: keep processing, but report the cause
        # instead of silently discarding it.
        print(error_message, exc)
# Load the source rows over JDBC using the `query` and `predicates` defined
# earlier in the script.
pw_df = spark.read.jdbc(
    url="jdbc:postgresql://timescale.ghana.powerwatch.io/powerwatch",
    table=query,
    predicates=predicates,
    properties={
        "user": args.user,
        "password": args.password,
        "driver": "org.postgresql.Driver",
    },
)

# If there are multiple saves below, this prevents reloading the data every time.
pw_df.cache()

# Mark every day with the number of unique sensors reporting in a 10-day
# window (sliding by 1 day) so we know the denominator for SAIDI/SAIFI.
# The window midpoint is used as the window's representative time and is then
# truncated to the day.
pw_distinct_core_id = (
    pw_df.select("time", "core_id")
    .groupBy(F.window("time", '10 days', '1 day'))
    .agg(
        # Alias the count directly instead of looking it up afterwards by
        # Spark's auto-generated column name "count(DISTINCT core_id)".
        F.countDistinct("core_id").alias("sensors_reporting"),
        F.array_distinct(F.collect_list("core_id")).alias("core_ids_reporting"),
    )
    .withColumn(
        "time",
        F.from_unixtime(
            (F.unix_timestamp(col("window.start"))
             + F.unix_timestamp(col("window.end"))) / 2
        ),
    )
    .select("sensors_reporting", "time", "core_ids_reporting")
    .withColumn("day", F.date_trunc("day", "time"))
    .select("day", "sensors_reporting", "core_ids_reporting")
)

# For every 4-minute window (sliding by 1 minute) keep the sensors whose
# average `is_powered` is exactly 1, then collect, per window start, the list
# of (core_id, latitude, longitude) structs.
pw_powered_locations = (
    pw_df.select(
        "time", "is_powered", "core_id",
        "location_latitude", "location_longitude",
    )
    .withColumn("is_powered", col("is_powered").cast(IntegerType()))
    .groupBy("core_id", F.window("time", '4 minutes', '1 minute'))
    .agg(
        F.avg("is_powered").alias("avg_power"),
        F.first("location_latitude").alias("location_latitude"),
        F.first("location_longitude").alias("location_longitude"),
    )
    .filter(col("avg_power") == 1)
    .withColumn("time", col("window.start"))
    .select("time", "core_id", "location_latitude", "location_longitude")
    .withColumn(
        "loc_struct",
        F.struct("core_id", "location_latitude", "location_longitude"),
    )
    .groupBy("time")
    .agg(F.collect_list("loc_struct").alias("loc_struct"))
    .select(col("time").alias("minute"), "loc_struct")
)