/
processor.py
59 lines (46 loc) · 1.68 KB
/
processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import sys
import kafka
import re
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
batch_durations = 2
input_topic = 'hits'
output_topic = 'output'
broker = 'localhost:9092'
kafka_client_cache = None
# If any one IP sends more than threshold * mean requests in any
# batch, we mark that IP as an attacker. I have found that 3 works well
# with the test data.
threshold = 3
def parse_log(log):
pattern = re.compile(r'^([0-9.]+)\s([w.-]+)\s([w.-]+)\s(\[.*\])\s"(.*)" (\d*) (\d*) "(.*)" "(.*)"')
m=pattern.match(log[1])
return m.group(1)
def time_to_minutes(row):
return (row[0]/60, row[1])
def output(ip, count):
global kafka_client_cache
if not kafka_client_cache:
kafka_client_cache = kafka.KafkaClient(broker)
producer = kafka.SimpleProducer(kafka_client_cache)
producer.send_messages(output_topic, str(ip))
def processRDD(rdd):
if (rdd.count() == 0): return
ips_with_counts = (rdd.map(parse_log)
.map(lambda x: (x, 1))
.reduceByKey(lambda a, b: a+b))
total = ips_with_counts.map(lambda (ip, count): count).reduce(lambda a, b: a+b)
mean = float(total) / ips_with_counts.count()
bad_ips = (ips_with_counts
.filter(lambda (ip, count): count > mean * threshold))
bad_ips.foreach(lambda (ip, count): output(ip, count))
def main():
sc = SparkContext(appName = "IntrusionDetector")
ssc = StreamingContext(sc, batch_durations)
kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker})
kvs.foreachRDD(processRDD)
ssc.start()
ssc.awaitTermination()
if __name__ == "__main__":
main()