def process_stream(self, rdd):
    """Downsample a streaming RDD to 1-second averages and persist them.

    Args:
        rdd: Incoming RDD of records with at least ``id``, ``ts`` and
            ``val`` fields.

    :rtype: None
    """
    # Guard clause instead of wrapping the whole body in an else-branch.
    if rdd.isEmpty():
        print("RDD is empty")
        return

    df = rdd.toDF()

    # Downsample: average `val` per id over fixed 1-second event-time
    # windows derived from the `ts` field.
    df2 = df.withColumn("timestamp", df.ts.cast("timestamp"))
    downsampled_df = df2.groupBy(
        'id',
        window("timestamp", "1 second").alias("ds_ts")).agg(
            F.round(F.avg("val"), 2).alias('downsample_avg'))
    final_df = downsampled_df.select(
        "id",
        downsampled_df['ds_ts'].start.alias("start_ts"),
        "downsample_avg").orderBy('start_ts', ascending=True)

    # Write to TimescaleDB. Best-effort: a failed batch is logged and
    # dropped rather than crashing the streaming job.
    # NOTE(review): credentials are hard-coded — move to config/env.
    try:
        connector = pgConnector.PostgresConnector(
            "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb",
            "datanode", "password")
        connector.write(final_df, "downsampled_table", "append")
    except Exception as e:
        print(e)
def process_stream(self, rdd):
    """Flag per-sensor anomaly windows in a streaming RDD and persist them.

    A reading is anomalous when it falls outside +/- 3 standard deviations
    of its centered rolling average (window of previous, current and next
    reading per sensor id).

    Args:
        rdd: Incoming RDD of records with at least ``id``, ``ts`` and
            ``val`` fields.

    :rtype: None
    """
    def detect_anomaly(sensor_readings, running_avg, std_dev):
        """Count readings outside ``running_avg +/- 3 * std_dev``.

        Args:
            sensor_readings: List(float)
            running_avg: List(float), aligned element-wise with
                ``sensor_readings``
            std_dev: float, or None when the group has fewer than
                two readings

        :rtype: int
        """
        # Spark's stddev is null for a single reading; nothing to flag.
        if std_dev is None:
            return 0
        # BUG FIX: compare each reading against its own rolling average
        # (aligned index), not `running_avg[i - 1]`, which compared with
        # the previous window and wrapped to the LAST element for the
        # first reading.
        num_anomalies = 0
        for reading, avg in zip(sensor_readings, running_avg):
            if abs(reading - avg) > 3 * std_dev:
                num_anomalies += 1
        return num_anomalies

    # Guard clause instead of wrapping the whole body in an else-branch.
    if rdd.isEmpty():
        print("RDD is empty")
        return

    # Cached: the DataFrame is traversed twice (window pass + groupBy).
    df = rdd.toDF().cache()

    # Centered rolling average over the previous, current and next row
    # within each sensor id.
    w = Window().partitionBy(col("id")).rowsBetween(-1, 1)
    df = df.withColumn('rolling_average', F.avg("val").over(w))

    agg_df = df.groupBy(['id']).agg(
        F.collect_list("val").alias("sensor_reading"),
        first("ts").cast('timestamp').alias("start_ts"),
        last("ts").cast('timestamp').alias("end_ts"),
        F.round(F.stddev("val"), 3).alias("std_temp"),
        F.collect_list("rolling_average").alias("rol_avg"))
    agg_df.show()

    anomaly_udf = udf(detect_anomaly, IntegerType())
    processed_df = agg_df.withColumn(
        "num_anomaly",
        anomaly_udf("sensor_reading", "rol_avg",
                    "std_temp")).sort(desc("num_anomaly"))
    final_df = processed_df.withColumn(
        "anomaly",
        F.when(F.col("num_anomaly") > 1, True).otherwise(False))
    final_df = final_df.select("id", "start_ts", "end_ts", "std_temp",
                               "num_anomaly", "anomaly")

    # Write to TimescaleDB. Best-effort: a failed batch is logged and
    # dropped rather than crashing the streaming job.
    # NOTE(review): credentials are hard-coded — move to config/env.
    try:
        connector = pgConnector.PostgresConnector(
            "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb",
            "datanode", "password")
        connector.write(final_df, "anomaly_window_tbl", "append")
    except Exception as e:
        print(e)
def process_df(self, df):
    """Detect global anomalies per sensor via S-H-ESD and persist counts.

    Args:
        df: DataFrame with columns ``id``, ``downsample_avg``,
            ``start_ts`` and ``end_ts``.

    :rtype: None
    """
    def detect_anomaly(ts):
        """Count anomalies in a downsampled series with Seasonal Hybrid ESD.

        Args:
            ts: sequence of floats (the collected downsampled averages)

        :rtype: int
        """
        outliers_indices = seasonal_esd(ts, hybrid=True, max_anomalies=10)
        return len(outliers_indices)

    grouped_df = df.groupBy(["id"]).agg(
        F.collect_list("downsample_avg").alias("downsampled_ts"),
        first("start_ts").alias("start_ts"),
        last("end_ts").alias("end_ts"))

    anomaly_udf = udf(detect_anomaly, IntegerType())
    # BUG FIX: the aggregated column is aliased "downsampled_ts"; the
    # original passed the non-existent column "downsampled_avg", which
    # would fail analysis with an unresolved-column error.
    processed_df = grouped_df.withColumn(
        "num_anomaly",
        anomaly_udf("downsampled_ts")).sort(desc("num_anomaly"))
    final_df = processed_df.select("id", "start_ts", "end_ts",
                                   "num_anomaly")

    # Write to TimescaleDB. Best-effort: a failed write is logged and
    # dropped rather than crashing the job.
    # NOTE(review): credentials are hard-coded — move to config/env.
    try:
        connector = pgConnector.PostgresConnector(
            "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb",
            "datanode", "password")
        connector.write(final_df, "global_anomalies_table", "append")
    except Exception as e:
        print(e)
def initDbConnection():
    """Build and return a PostgresConnector with its default settings."""
    return pgConnector.PostgresConnector()