def __init__(self, file_path, limit, skip):
    """Record the input location and bounds, then create the statistics engine.

    file_path: stored unchanged as ``self.path``.
    limit: stored unchanged as ``self.limit``.
    skip: becomes the initial value of ``self.curPacketIndx``.
    """
    self.path, self.limit, self.curPacketIndx = file_path, limit, skip

    ### Prep Feature extractor (AfterImage) ###
    # Both capacities passed to netStat share the same very large value.
    maxHost = maxSess = 100000000000
    self.nstat = ns.netStat(np.nan, maxHost, maxSess)
def test_run(self):
    """Replay the SYN capture through a fresh netStat and report timing.

    Every row after the first (presumably the header) is converted to the
    arguments of ``updateGetStats`` and fed to the engine while the wall-clock
    cost of each call is recorded. When the row counter reaches 10000 the
    most recently processed packet and the mean per-packet time are printed
    and the loop stops. The method makes no assertions.
    """
    maxHost = 50
    maxSess = 50
    nstat = ns.netStat(maxHost, maxSess)
    # Raw string: same path as before, without relying on the invalid "\d" escape.
    with open(r'D:\datasets\SYN.tsv', 'rt', encoding="utf8") as tsvin:
        tsvin = csv.reader(tsvin, delimiter='\t')
        count = 0
        timestats = []
        for row in tsvin:
            count = count + 1
            if count % 10000 == 0:
                print(count)
            if count <= 1:
                continue  # first row is never parsed as a packet
            if count == 10000:
                # The names below still hold the values of the previous row.
                print((srcMAC, srcIP, srcproto, dstIP, dstproto,
                       int(framelen), float(timestamp)))
                print('Mean packet processing time: ' +
                      str(np.mean(timestats)))
                break

            timestamp = row[0]
            framelen = row[1]
            srcIP = row[5] + row[50]  # ipv4 or ipv6 address (one will be '')
            dstIP = row[6] + row[51]  # ipv4 or ipv6 address
            srcMAC = row[2]
            # UDP or TCP port: concatenating the two port strings acts as an OR
            srcproto = row[14] + row[32]
            dstproto = row[15] + row[33]
            if srcproto == '':  # it's a L2/L1 level protocol
                if row[37] != '':  # is ARP
                    srcproto = 'arp'
                    dstproto = 'arp'
                    srcIP = row[2]  # src MAC
                    dstIP = row[3]  # dst MAC
                elif row[36] != '':  # is IGMP
                    srcproto = 'igmp'
                    dstproto = 'igmp'
                elif row[34] != '':  # is ICMP
                    srcproto = 'icmp'
                    dstproto = 'icmp'
                elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
                    srcIP = row[2]  # src MAC
                    dstIP = row[3]  # dst MAC

            tic = time.time()
            stats = nstat.updateGetStats(srcMAC, srcIP, srcproto, dstIP,
                                         dstproto, int(framelen),
                                         float(timestamp))
            toc = time.time() - tic
            timestats.append(toc)
def __init__(self, file_path, limit=np.inf):
    """Initialise reader state, run ``__prep__`` and create the statistics engine.

    file_path: stored unchanged as ``self.path``.
    limit: stored unchanged as ``self.limit`` (defaults to ``np.inf``).
    """
    # Caller-supplied settings
    self.path, self.limit = file_path, limit

    # Parser state, all reset before __prep__ is invoked
    self.parse_type = None  # unknown
    self.curPacketIndx = 0
    self.tsvin = None  # used for parsing TSV file
    self.scapyin = None  # used for parsing pcap with scapy

    ### Prep pcap ##
    self.__prep__()

    ### Prep Feature extractor (AfterImage) ###
    maxHost = maxSess = 100000000000
    self.nstat = ns.netStat(np.nan, maxHost, maxSess)
def test_HostLimit(self):
    """Check that netStat raises LookupError once more than maxHost hosts are seen.

    maxHost + 1 distinct source addresses are fed in, each with maxSess
    sessions. ``hostCount`` counts the hosts that were added completely, so
    when LookupError fires it must equal maxHost.
    """
    maxHost = 10
    maxSess = 10
    nstat = ns.netStat(maxHost, maxSess)
    t = 0
    hostCount = 0
    try:
        for src_h in range(0, maxHost + 1):
            for sid in range(0, maxSess):
                nstat.updateGetStats('MAC', '10.0.0.' + str(src_h), str(sid),
                                     '10.0.1.1', str(sid), 1, t)
                t = t + 0.001
            hostCount = hostCount + 1
    except LookupError:
        # If this fails, LookupError was not thrown at the right moment
        # (more or fewer hosts were allowed). assertEqual replaces the
        # assertEquals alias, which no longer exists in Python 3.12.
        self.assertEqual(hostCount - maxHost, 0)
        return
    # LookupError wasn't raised but it should have been.
    # NOTE(review): hostCount is maxHost + 1 here, so this assertion always
    # passes; consider self.fail(...) if a missing exception must fail the test.
    self.assertLessEqual(maxHost, hostCount)
def test_MACIPLimit(self):
    """Check that netStat raises LookupError when too many MAC-IP pairs are seen.

    For each of maxHost source addresses, four different MAC strings are
    submitted. ``MACIPCount`` counts successful updates; when LookupError
    fires it is expected to be exactly ``maxHost * 3 + 1``.
    """
    maxHost = 10
    maxSess = 10
    nstat = ns.netStat(maxHost, maxSess)
    t = 0
    MACIPCount = 0
    try:
        for src_h in range(0, maxHost):
            for MAC in range(0, 3 + 1):
                nstat.updateGetStats(str(MAC), '10.0.0.' + str(src_h), str(1),
                                     '10.0.1.1', str(2), 1, t)
                t = t + 0.001
                MACIPCount = MACIPCount + 1
    except LookupError:
        # If this fails, LookupError was not thrown at the right moment
        # (more or fewer pairs were allowed). assertEqual replaces the
        # assertEquals alias, which no longer exists in Python 3.12.
        self.assertEqual(MACIPCount - maxHost * 3, 1)
        return
    # LookupError wasn't raised but it should have been.
    # NOTE(review): MACIPCount is maxHost * 4 here, so this assertion always
    # passes; consider self.fail(...) if a missing exception must fail the test.
    self.assertLessEqual(maxHost * 3, MACIPCount)
def test_purgeOldData(self):
    """Insert an old and a recent batch of sessions, purge, and check the table shrinks.

    The first batch is stamped t=0, the second t=1000000000. After
    ``purgeOldRecords`` is called with the later timestamp, ``nstat.HT.HT``
    must hold exactly 400 fewer entries. Shallow ``sys.getsizeof`` totals
    and the purge duration are printed for information only.
    """
    maxHost = 255
    maxSess = 80000
    nstat = ns.netStat(maxHost, maxSess)

    def feed(src_total, dst_total, sid_total, stamp):
        # One update per (source, destination, session id) combination.
        for src_h in range(0, src_total):
            src_addr = '10.0.0.' + str(src_h)
            for dst_h in range(0, dst_total):
                dst_addr = '10.0.0.' + str(dst_h)
                for sid in range(0, sid_total):
                    nstat.updateGetStats('MAC', src_addr, str(sid), dst_addr,
                                         str(sid), 1, stamp)

    def footprint():
        # Shallow container sizes only; referenced objects are not followed.
        tables = (nstat.HT.HT, nstat.Rec_MAC_Host, nstat.Rec_Hosts,
                  nstat.Rec_Sessions)
        return sum(sys.getsizeof(tbl) for tbl in tables)

    print("Adding Before Sessions")
    feed(5, 10, 5, 0)

    print("Adding After Sessions")
    t = 1000000000
    feed(5, 5, 2, t)

    print("Begin Purge")
    before = len(nstat.HT.HT)
    memb4 = footprint()
    started = time.time()
    nstat.purgeOldRecords(t)
    toc = time.time() - started
    gc.collect()
    memAft = footprint()
    after = len(nstat.HT.HT)

    mib = 1024 * 1024
    report = ''.join([
        'Purge: Before ', str(before),
        ' After ', str(after),
        'Time: ', str(toc),
        ' seconds.\nMem Before: ', str(memb4 / mib),
        'MB, Mem After: ', str(memAft / mib),
        ' MB',
    ])
    print(report)
    self.assertEqual(before - after, 400)  # there should be 400 fewer entries
def test_run_affectOfOneSided_winstats(self):  #should have no affect on results
    """Replay the SYN capture twice, without and with purging, and compare stats.

    Pass 1 feeds rows to a fresh netStat and keeps the last returned vector
    in ``stats1``. Pass 2 repeats this on a new netStat, calling
    ``purgeOldRecords`` whenever the row counter is a multiple of 10000, and
    keeps the last vector in ``stats2``. Each pass stops when the counter
    reaches 100000. The mean percent difference between the two vectors is
    then asserted.
    """
    maxHost = 50
    maxSess = 50
    # Raw string: same path as before, without relying on the invalid "\d" escape.
    dataset = r'D:\datasets\SYN.tsv'

    def packet_args(row):
        """Return the positional arguments of updateGetStats for one TSV row."""
        timestamp = row[0]
        framelen = row[1]
        srcIP = row[5] + row[50]  # ipv4 or ipv6 address (one will be '')
        dstIP = row[6] + row[51]  # ipv4 or ipv6 address
        srcMAC = row[2]
        # UDP or TCP port: concatenating the two port strings acts as an OR
        srcproto = row[14] + row[32]
        dstproto = row[15] + row[33]
        if srcproto == '':  # it's a L2/L1 level protocol
            if row[37] != '':  # is ARP
                srcproto = 'arp'
                dstproto = 'arp'
                srcIP = row[2]  # src MAC
                dstIP = row[3]  # dst MAC
            elif row[36] != '':  # is IGMP
                srcproto = 'igmp'
                dstproto = 'igmp'
            elif row[34] != '':  # is ICMP
                srcproto = 'icmp'
                dstproto = 'icmp'
            elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
                srcIP = row[2]  # src MAC
                dstIP = row[3]  # dst MAC
        return (srcMAC, srcIP, srcproto, dstIP, dstproto, int(framelen),
                float(timestamp))

    # Pass 1: no purges
    nstat = ns.netStat(maxHost, maxSess)
    with open(dataset, 'rt', encoding="utf8") as tsvin:
        tsvin = csv.reader(tsvin, delimiter='\t')
        count = 0
        timestats = []
        for row in tsvin:
            count = count + 1
            if count % 10000 == 0:
                print(count)
            if count <= 1:
                continue  # first row is never parsed as a packet
            if count == 100000:
                print('Mean packet processing time: ' +
                      str(np.mean(timestats)))
                break
            packet = packet_args(row)
            tic = time.time()
            stats1 = nstat.updateGetStats(*packet)
            toc = time.time() - tic
            timestats.append(toc)

    # Pass 2: reset, with purges
    nstat = ns.netStat(maxHost, maxSess)
    with open(dataset, 'rt', encoding="utf8") as tsvin:
        tsvin = csv.reader(tsvin, delimiter='\t')
        count = 0
        timestats = []
        for row in tsvin:
            count = count + 1
            if count % 10000 == 0:
                print(count)
            if count <= 1:
                continue
            if count == 100000:
                # `packet` still describes the previously processed row.
                print(packet)
                print(stats1[0:10])
                print(stats2[0:10])
                print('Relative error')
                print(
                    np.absolute(np.array(stats1) - np.array(stats2)) /
                    np.array(stats1))
                print('percent error:')
                print((np.array(stats2) / np.array(stats1) - 1) * 100)
                print('Mean packet processing time: ' +
                      str(np.mean(timestats)))
                break
            packet = packet_args(row)
            if count % 10000 == 0:
                # Purge using the timestamp of the packet about to be added.
                nstat.purgeOldRecords(packet[6])
            tic = time.time()
            stats2 = nstat.updateGetStats(*packet)
            toc = time.time() - tic
            timestats.append(toc)

    # average percent error with purges
    # NOTE(review): the header comment says purging should not affect results,
    # yet this requires the mean percent error to be at least 1e-6 - confirm
    # the intended direction of the comparison.
    self.assertGreaterEqual(
        np.mean((np.array(stats2) / np.array(stats1) - 1) * 100), 0.000001)
def RTSP_videoJak_Dataset_Gen():
    """Build the labelled videoJak CSV from the parsed RTSP capture.

    The first input row is copied as the header, followed by the names from
    ``ht.getNetStatHeaders()`` and a final "Class" column. Every later row is
    passed through ``ht.updateGetStats``; the output line is the original
    fields (empty ones replaced by '-1'), the returned statistics and the
    label. The label is "1" when the timestamp is at least 2874.460763 and the
    fourth dotted component of row[15] is "13", otherwise "0". Rows that raise
    during processing are reported and skipped.
    """
    ht = ns.netStat()
    with io.open('/media/root/66fff5fd-de78-45b0-880a-d2e8104242b5/datasets/RTSP_record_parsed.tsv', 'rt', encoding="utf8") as tsvin, \
            io.open('/media/root/66fff5fd-de78-45b0-880a-d2e8104242b5/datasets/videoJak_full.csv', 'wt', newline='') as csvout:
        tsvin = csv.reader(tsvin, delimiter='\t')
        count = 0
        for row in tsvin:
            count = count + 1
            if count % 10000 == 0:
                print(count)
            if count == 1:
                # Header: original columns, netStat columns, then the class column.
                for f in row:
                    csvout.write(unicode(str(f) + ",", "utf-8"))
                for f in ht.getNetStatHeaders():
                    csvout.write(unicode(str(f) + ",", "utf-8"))
                csvout.write(unicode("Class", "utf-8"))
                csvout.write(unicode("\n", "utf-8"))
            else:
                try:
                    timestamp = row[53]
                    framelen = row[54]
                    srcIP = row[15]  # ipv4 or ipv6 address
                    dstIP = row[16]  # ipv4 or ipv6 address
                    # UDP or TCP port: concatenating the two port strings acts as an OR
                    srcproto = row[17] + row[33]
                    dstproto = row[18] + row[34]
                    if srcproto == '':  # it's a L2/L1 level protocol
                        if row[48] != '':  # is ARP
                            srcproto = 'arp'
                            dstproto = 'arp'
                            srcIP = row[49]  # src MAC
                            dstIP = row[51]  # dst MAC
                        elif row[40] != '':  # is IGMP
                            srcproto = 'igmp'
                            dstproto = 'igmp'
                        elif row[37] != '':  # is ICMP
                            srcproto = 'icmp'
                            dstproto = 'icmp'
                        elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
                            srcIP = row[1]  # src MAC
                            dstIP = row[0]  # dst MAC

                    # srcIP is passed twice: once in the first slot and once as the source address.
                    stats = ht.updateGetStats(srcIP, srcIP, srcproto, dstIP,
                                              dstproto, int(framelen),
                                              float(timestamp))

                    Label = "0"
                    if float(timestamp) >= 2874.460763:  # 1750648 frame.no
                        print("reached")
                        if row[15] != '':  # row[15] is the source IP column
                            if row[15].split(".")[3] == "13":
                                Label = "1"

                    # replace missing values with -1
                    for index, item in enumerate(row):
                        if item == '':
                            row[index] = '-1'

                    stats_text = ', '.join(map(str, list(stats)))
                    line = ', '.join(map(str, row))
                    line += "," + stats_text
                    line += "," + Label
                    csvout.write(unicode(str(line), "utf-8"))
                    csvout.write(unicode("\n", "utf-8"))
                except Exception:
                    # Narrowed from a bare except so Ctrl-C / SystemExit still
                    # stop the run; any ordinary error rejects just this row.
                    # NOTE(review): the extra increment makes `count` drift
                    # from the real row number after each rejection.
                    count += 1
                    print("observation " + str(count) + " was rejected")
                    continue
def physicalMIM_Dataset_Gen():
    """Build the labelled piddle CSV from the parsed capture.

    The first input row is copied as the header, followed by the names from
    ``ht.getNetStatHeaders()`` and a final "Class" column. Every later row is
    passed through ``ht.updateGetStats`` with a direction tag ("in" when
    row[1] equals '00:a0:de:f1:88:6e', otherwise "out") as first argument.
    The output line is the original fields (empty ones replaced by '-1'), the
    returned statistics and the label, which is "1" once ``count`` reaches
    5179941. Rows for which ``updateGetStats`` raises are skipped.
    """
    ht = ns.netStat()
    with io.open('/media/root/66fff5fd-de78-45b0-880a-d2e8104242b5/datasets/piddle_record_parsed.tsv', 'rt', encoding="utf8") as tsvin, \
            io.open('/media/root/66fff5fd-de78-45b0-880a-d2e8104242b5/datasets/piddle_FULL.csv', 'wt', newline='') as csvout:
        tsvin = csv.reader(tsvin, delimiter='\t')
        count = 0
        for row in tsvin:
            count = count + 1
            if count % 10000 == 0:
                print(count)
            if count == 1:
                # Header: original columns, netStat columns, then the class column.
                for f in row:
                    csvout.write(unicode(str(f) + ",", "utf-8"))
                for f in ht.getNetStatHeaders():
                    csvout.write(unicode(str(f) + ",", "utf-8"))
                csvout.write(unicode("Class", "utf-8"))
                csvout.write(unicode("\n", "utf-8"))
            else:
                timestamp = row[53]
                framelen = row[54]
                srcIP = row[15]  # ipv4 or ipv6 address
                dstIP = row[16]  # ipv4 or ipv6 address
                # UDP or TCP port: concatenating the two port strings acts as an OR
                srcproto = row[17] + row[33]
                dstproto = row[18] + row[34]
                if srcproto == '':  # it's a L2/L1 level protocol
                    if row[48] != '':  # is ARP
                        srcproto = 'arp'
                        dstproto = 'arp'
                        srcIP = row[49]  # src MAC
                        dstIP = row[51]  # dst MAC
                    elif row[40] != '':  # is IGMP
                        srcproto = 'igmp'
                        dstproto = 'igmp'
                    elif row[37] != '':  # is ICMP
                        srcproto = 'icmp'
                        dstproto = 'icmp'
                    elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
                        srcIP = row[1]  # src MAC
                        if srcIP == '':
                            srcIP = row[49]
                        dstIP = row[0]  # dst MAC
                        if dstIP == '':
                            dstIP = row[51]
                        srcproto = "other"
                        dstproto = "other"
                # NOTE(review): this branch is attached to the outer `if`
                # (ports present but no IP addresses); the original layout was
                # ambiguous - confirm against the upstream file.
                elif srcIP + dstIP == '':
                    srcIP = row[1]
                    dstIP = row[0]

                if row[1] == '00:a0:de:f1:88:6e':  # the source is the yamaha gateway
                    direction = "in"
                else:
                    direction = "out"

                try:
                    stats = ht.updateGetStats(direction, srcIP, srcproto,
                                              dstIP, dstproto, int(framelen),
                                              float(timestamp))
                except Exception:
                    # Narrowed from a bare except so Ctrl-C / SystemExit still
                    # stop the run.
                    # NOTE(review): the extra increment shifts `count`, which
                    # the label threshold below depends on - confirm intent.
                    count += 1
                    print("skipped netstat")
                    continue

                Label = "0"
                if count >= 5179941:
                    Label = "1"

                # replace missing values with -1
                for index, item in enumerate(row):
                    if item == '':
                        row[index] = '-1'

                stats_text = ', '.join(map(str, list(stats)))
                line = ', '.join(map(str, row))
                line += "," + stats_text
                line += "," + Label
                csvout.write(unicode(str(line), "utf-8"))
                csvout.write(unicode("\n", "utf-8"))
def CTU52818_Desaset_Gen_V2_400():
    """Build the labelled CTU-52818 CSV from the timestamp-sorted capture.

    The first input row is copied as the header, followed by the names from
    ``ht.getNetStatHeaders()`` and a final "Class" column. For every later row
    the time of day in row[0] is converted to milliseconds relative to the
    row seen when ``count == 2``, "address:port" values in row[3] / row[5]
    are split, and the packet is passed to ``ht.updateGetStats``. The output
    line is the original fields (empty ones replaced by '-1'), the returned
    statistics and the label ("1" when row[11] contains "Botnet"). Rows that
    raise during processing are reported and skipped.
    """

    def _ms_of_day(stamp):
        # The second space-separated token is read as hours:minutes:seconds.
        clock = [float(part) for part in stamp.split(' ')[1].split(':')]
        return clock[0] * 3600 * 1000 + clock[1] * 60 * 1000 + clock[2] * 1000

    ht = ns.netStat(50000, 50000)
    with io.open('E:/thesis_data/datasets/ctu52818_400_sortedTS.txt', 'rt', encoding="utf8") as tsvin, \
            io.open('E:/thesis_data/datasets/ctu52818_400_full.csv', 'wt', newline='') as csvout:
        tsvin = csv.reader(tsvin, delimiter=',')
        count = 0
        for row in tsvin:
            count = count + 1
            if count % 10000 == 0:
                print(count)
            if count == 1:
                # Header: original columns, netStat columns, then the class column.
                for f in row:
                    csvout.write(unicode(str(f) + ",", "utf-8"))
                for f in ht.getNetStatHeaders():
                    csvout.write(unicode(str(f) + ",", "utf-8"))
                csvout.write(unicode("Class", "utf-8"))
                csvout.write(unicode("\n", "utf-8"))
            else:
                try:
                    if count == 2:
                        # The first data row defines time zero.
                        startTS = _ms_of_day(row[0])
                    timestamp = _ms_of_day(row[0]) - startTS  # change format
                    framelen = row[9]
                    srcIP = row[3].split(':')[0]  # address part of "addr:port"
                    dstIP = row[5].split(':')[0]  # address part of "addr:port"
                    if ':' in row[3]:
                        srcproto = row[3].split(':')[1]  # UDP or TCP port
                    else:
                        srcproto = 'NoProto'
                    # Fixed copy-paste slip: the destination port is now guarded
                    # by a check on row[5] instead of row[3].
                    if ':' in row[5]:
                        dstproto = row[5].split(':')[1]  # UDP or TCP port
                    else:
                        dstproto = 'NoProto'
                    if srcproto == '':  # it's a L2/L1 level protocol
                        if row[2] == "ARP":  # is ARP
                            srcproto = 'ARP'
                            dstproto = 'ARP'
                        elif row[2] == 'IGMP':  # is IGMP
                            srcproto = 'IGMP'
                            dstproto = 'IGMP'
                        elif row[2] == 'ICMP':  # is ICMP
                            srcproto = 'ICMP'
                            dstproto = 'ICMP'
                        elif srcIP + srcproto + dstIP + dstproto == '':  # some other protocol
                            srcIP = row[3].split(':')[0]  # src MAC
                            dstIP = row[5].split(':')[0]  # dst MAC

                    # srcIP is passed twice: once in the first slot and once as the source address.
                    stats = ht.updateGetStats(srcIP, srcIP, srcproto, dstIP,
                                              dstproto, int(framelen),
                                              float(timestamp))

                    Label = "0"
                    if row[11].find("Botnet") != -1:
                        Label = "1"

                    # replace missing values with -1
                    for index, item in enumerate(row):
                        if item == '':
                            row[index] = '-1'

                    stats_text = ', '.join(map(str, list(stats)))
                    line = ', '.join(map(str, row))
                    line += "," + stats_text
                    line += "," + Label
                    csvout.write(unicode(str(line), "utf-8"))
                    csvout.write(unicode("\n", "utf-8"))
                except Exception as ex:
                    # Exceptions have no `.message` attribute on Python 3;
                    # printing the exception itself works everywhere.
                    print(ex)
                    # NOTE(review): the extra increment makes `count` drift
                    # from the real row number after each rejection.
                    count += 1
                    print("observation " + str(count) + " was rejected")
                    continue
def sortCTU52818File():
    """Write the CTU-52818 capture reordered by time of day.

    The input file is opened twice: once through ``csv.reader`` to get the
    timestamp field, once as plain text to keep each raw line. The header row
    is re-emitted with a trailing comma after each field. Every data row is
    paired with its raw line and a sort key (0 for the first data row,
    otherwise milliseconds derived from the H:M:S token of row[0]); the pairs
    are sorted by key and the raw lines written out. "here" is printed for any
    row whose re-joined non-empty fields contain ',,'.
    """

    def clock_ms(stamp):
        # The second space-separated token is read as hours:minutes:seconds.
        parts = [float(piece) for piece in stamp.split(' ')[1].split(':')]
        return parts[0] * 3600 * 1000 + parts[1] * 60 * 1000 + parts[2] * 1000

    with io.open('E:/thesis_data/datasets/ctu52818_400.txt', 'rt', encoding="utf8") as tsvinFullLine, \
            io.open('E:/thesis_data/datasets/ctu52818_400.txt', 'rt', encoding="utf8") as tsvin, \
            io.open('E:/thesis_data/datasets/ctu52818_400_sortedTS.csv', 'wt', encoding="utf8") as tswrite:
        reader = csv.reader(tsvin, delimiter='\t')
        count = 0
        ht = ns.netStat(50000, 50000)
        # Consume the header from the raw handle so both handles stay in step.
        raw_line = tsvinFullLine.readline()
        keyed_lines = []
        for row in reader:
            count += 1
            if count % 10000 == 0:
                print(count)

            if count == 1:
                header = ''.join(str(field) + "," for field in row)
                tswrite.write(unicode(str(header) + "\n", "utf-8"))
                continue

            # Parsed for every data row; the first data row is pinned to key 0.
            parsed = clock_ms(row[0])
            timestamp = 0 if count == 2 else parsed

            raw_line = tsvinFullLine.readline()

            # Re-join the non-empty fields; only the final column omits the comma.
            last = len(row) - 1
            rejoined = ''.join(
                str(value) + ("," if pos < last else "")
                for pos, value in enumerate(row) if value != '')

            keyed_lines.append((timestamp, raw_line))
            if ',,' in rejoined:
                print("here")

        keyed_lines.sort(key=lambda pair: pair[0])
        for pair in keyed_lines:
            tswrite.write(unicode(str(pair[1]) + "\n", "utf-8"))
        print(keyed_lines[:30])
        print("finished sort")