def saveAsPickleFile(time, rdd):
    """
    Closure that saves each RDD in the DStream as pickled data in a file.
    This closure is called by the py4j callback server, which passes the
    batch time first; `prefix` and `suffix` are captured from the
    enclosing scope.
    """
    path = rddToFileName(prefix, suffix, time)
    rdd.saveAsPickleFile(path)
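# A minimal sketch of how closures like saveAsPickleFile are typically
# registered, assuming a socket text stream. Every name below other than
# saveAsPickleFile is an illustrative assumption, not from the snippets in
# this file. DStream.foreachRDD invokes two-argument callbacks as
# func(time, rdd), which is why the batch time comes first.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName='pickle-sink')
ssc = StreamingContext(sc, 10)  # 10-second batches
prefix, suffix = 'data/pickled', 'pkl'  # free variables captured by the closure
stream = ssc.socketTextStream('localhost', 9999)
stream.foreachRDD(saveAsPickleFile)
ssc.start()
ssc.awaitTermination()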
def dosth(time, rdd, spark):
    """
    Run ad-hoc SQL over one batch and write the result out.
    On changing a DataFrame column's data type, see:
    https://stackoverflow.com/questions/46432789/how-to-change-pyspark-data-frame-column-data-type
    """
    if rdd.isEmpty():
        return
    sqlContext = getSqlContextInstance(rdd.context)
    df = sqlContext.createDataFrame(rdd)
    df.show()
    df.printSchema()
    # df.groupBy("user").count().show()
    df.createOrReplaceTempView('firewall')
    sqlDF = spark.sql(
        "select server, app, action, count(*) as cnt from firewall "
        "group by server, app, action order by cnt desc"
    )
    sqlDF.show()
    # Write the aggregate as Parquet. Note the default save mode is 'error',
    # so this fails on the second batch because the path already exists.
    if True:
        sqlDF.write.parquet("data/firewall.parquet")
    # Read the aggregate back from Parquet (disabled).
    if False:
        pqtDF = spark.read.parquet("data/firewall.parquet")
        pqtDF.createOrReplaceTempView("pqt_firewall")
        pqtv2DF = spark.sql("SELECT * FROM pqt_firewall")
        pqtv2DF.show()
    # Write the aggregate as JSON, one directory per batch time.
    if True:
        enriched_data_path = 'data/firewall_df.json'
        path = rddToFileName(enriched_data_path, None, time)
        sqlDF.write.json(path, mode='error')
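# Both dosth and process_messages call getSqlContextInstance, which is not
# defined in these snippets. A minimal sketch, assuming the lazily
# instantiated singleton pattern from the Spark Streaming programming guide:
from pyspark.sql import SQLContext

def getSqlContextInstance(sparkContext):
    # Reuse one SQLContext per process instead of creating one per batch.
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']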
def saveAsTextFile(t, rdd):
    path = rddToFileName(prefix, suffix, t)
    try:
        rdd.saveAsTextFile(path)
    except Py4JJavaError as e:
        # After recovering from a checkpoint, foreachRDD may be called
        # twice for the same batch; ignore the duplicate write.
        if 'FileAlreadyExistsException' not in str(e):
            raise
from datetime import datetime
from typing import Optional, TypeVar

from py4j.protocol import Py4JJavaError
from pyspark import RDD

T = TypeVar("T")

def saveAsTextFile(t: Optional[datetime], rdd: RDD[T]) -> None:
    path = rddToFileName(prefix, suffix, t)
    try:
        rdd.saveAsTextFile(path)
    except Py4JJavaError as e:
        # After recovering from a checkpoint, foreachRDD may be called
        # twice for the same batch; ignore the duplicate write.
        if "FileAlreadyExistsException" not in str(e):
            raise
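# For reference: rddToFileName, used by all of the closures above, comes from
# pyspark.streaming.util and builds paths of the form "prefix-TIMESTAMP" or
# "prefix-TIMESTAMP.suffix". The sketch below approximates its behavior; the
# packaged implementation is authoritative.
import time
from datetime import datetime

def rddToFileName(prefix, suffix, timestamp):
    if isinstance(timestamp, datetime):
        # Convert a batch-time datetime to epoch milliseconds.
        seconds = time.mktime(timestamp.timetuple())
        timestamp = int(seconds * 1000) + timestamp.microsecond // 1000
    if suffix is None:
        return prefix + '-' + str(timestamp)
    return prefix + '-' + str(timestamp) + '.' + suffix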
def process_messages(time, rdd, ssc, model, enriched_data_path, zookeeper_hosts,
                     kafka_alert_topic, kafka_enriched_data_topic, max_batches):
    global BATCH_COUNTER
    if rdd.isEmpty():
        return
    sqlContext = getSqlContextInstance(rdd.context)
    df = sqlContext.createDataFrame(rdd)

    # Enrich data to build the preprocessed dataframe.
    df = enrich_data(df)

    # Persist enriched data to storage (direct from Spark to HDFS).
    # This will create a file per partition per batch.
    if enriched_data_path:
        path = rddToFileName(enriched_data_path, None, time)
        df.write.json(path, mode='error')

    # Send all enriched data to a Kafka topic.
    # Note that each worker sends its own partitions directly to Kafka; the driver is not in the data path.
    # This can be consumed by Flume to write to HDFS, allowing multiple batches to be appended to the same file.
    if kafka_enriched_data_topic:
        df.foreachPartition(lambda d: write_partition_to_kafka(
            d, zookeeper_hosts=zookeeper_hosts, kafka_topic=kafka_enriched_data_topic))

    # Build the feature vector.
    df = build_features_vector(df)

    # Show 10 records of the dataframe.
    # df.select(['duration','src_bytes','dst_bytes','features','label']).show(10)

    # Predict anomalies with the model.
    # We must use RDDs, not dataframes, because we can't save/load the pipelined ML model using PySpark yet.
    if model:
        features_rdd = extract_features(df)
        predictions_rdd = model.predict(features_rdd)
        features_and_predictions_rdd = df.rdd.zip(predictions_rdd)
        anomalies_rdd = features_and_predictions_rdd.filter(lambda x: x[1] <= 0).map(lambda x: x[0])
        anomalies = anomalies_rdd.collect()
        print('Predicted %d anomalies' % len(anomalies))

        # For demo purposes, only alert on the first 5 anomalies.
        anomalies = anomalies[:5]

        # Send anomalies to Kafka.
        # Note that since we expect very few anomalies, the records are brought into the driver,
        # which then sends them to Kafka.
        if anomalies:
            client = KafkaClient(zookeeper_hosts=zookeeper_hosts)
            topic = client.topics[kafka_alert_topic]
            with topic.get_producer(delivery_reports=False) as producer:
                for row in anomalies:
                    alert = row.asDict()
                    del alert['features']  # remove the features vector because we can't serialize it to JSON
                    alert['alert_text'] = 'predicted to be an anomaly'
                    msg = json.dumps(alert)
                    producer.produce(msg)
                    print('Sent alert: %s' % msg)

    # Stop after the specified number of batches. This is used for development only.
    BATCH_COUNTER += 1
    if max_batches > 0 and BATCH_COUNTER >= max_batches:
        print('Reached maximum number of batches.')
        ssc.stop(True, False)
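# process_messages references write_partition_to_kafka, which is not defined
# in these snippets. A minimal sketch, assuming pykafka (the same client the
# alert path above uses) and JSON-serializable rows; the real helper may
# differ. It runs on the executors, so each partition gets its own producer
# and the driver stays out of the data path.
import json
from pykafka import KafkaClient

def write_partition_to_kafka(rows, zookeeper_hosts, kafka_topic):
    client = KafkaClient(zookeeper_hosts=zookeeper_hosts)
    topic = client.topics[kafka_topic]
    with topic.get_producer(delivery_reports=False) as producer:
        for row in rows:
            producer.produce(json.dumps(row.asDict()))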