def insert_and_set_id(table: dataset.Table, obj: Dict[str, Any],
                      idfield: str = 'id') -> Any:  # but typically int
    """The dataset table's insert() command returns the primary key.

    However, it doesn't store that key back into the record, and we want
    users to do that consistently.
    """
    pk = table.insert(obj)
    obj[idfield] = pk
    return pk
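# A minimal usage sketch for insert_and_set_id, assuming the `dataset` PyPI
# package (whose Table.insert() returns the new row's primary key); the table
# and field names here are illustrative only.
def _demo_insert_and_set_id():
    import dataset
    db = dataset.connect('sqlite:///:memory:')
    user = {'name': 'alice'}
    pk = insert_and_set_id(db['users'], user)
    # The record now carries its primary key under the default 'id' field.
    assert user['id'] == pk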
def get_raw(opt, callback=None, keys=(), h5=None):
    from extractor import PcapExtractor, FlowExtractor
    from dataset import Table
    from os.path import isfile, basename
    if opt.in_format == 'pcap':
        extract = PcapExtractor(('time', 'src', 'sport', 'dst', 'dport',
                                 'proto', 'paylen', 'flags', 'flow'))
        tr = h5['traces'] if h5 else None
        parser = get_packets
    elif opt.in_format == 'netflow':
        extract = FlowExtractor(('time', 'duration', 'src', 'sport', 'dst',
                                 'dport', 'proto', 'packets', 'size', 'flags',
                                 'flows', 'flow'))
        tr = h5['netflows'] if h5 else None
        parser = get_netflow
    else:
        raise NotImplementedError('in_format')
    if tr is not None:
        if not callable(callback):
            callback = lambda data, fn: data.save(tr[fn])
        if not keys:
            keys = tr.keys()
    elif not callable(callback):
        raise ValueError('h5 file needed when no callback is given')
    for fn in opt.file:
        if isfile(fn) and basename(fn) not in keys:
            print('## Extracting features from file %s...' % fn)
            pkts = parser(fn, extract)
            print('\t%d records captured' % len(pkts))
            data = Table(pkts, extract.fields)
            del pkts
            print('## Storing matrix in %s...' % opt.database)
            callback(data, basename(fn))
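# A hedged usage sketch for get_raw: `opt` is assumed to be an argparse-style
# namespace with `in_format`, `file`, and `database` attributes (names taken
# from the function body above); the file path is hypothetical, and the
# callback simply collects the extracted tables in memory instead of saving
# them into an HDF5 group.
def _demo_get_raw():
    from argparse import Namespace
    opt = Namespace(in_format='pcap',
                    file=['capture.pcap'],  # hypothetical input path
                    database='features.h5')
    results = {}

    def collect(data, fn):
        # Without an h5 group, a callable callback is mandatory.
        results[fn] = data

    get_raw(opt, callback=collect)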
def load_quantized_data(self, df):
    tic = time.time()
    name = self.ds.raw_data.name + '_quantized'
    self.ds.quantized_data = Table(name, Source.DF, df=df)
    # Re-store to DB, ensuring numerical values are stored as floats.
    df_correct_type = df.copy()
    for attr in self.ds.numerical_attrs:
        df_correct_type.loc[df_correct_type[attr] == NULL_REPR, attr] = np.nan
        df_correct_type[attr] = df_correct_type[attr].astype(float)
    df_correct_type.to_sql(name, self.ds.engine.engine, if_exists='replace',
                           index=False, index_label=None)
    for attr in self.ds.quantized_data.get_attributes():
        self.ds.quantized_data.create_db_index(self.ds.engine, [attr])
    logging.debug('Time to load quantized dataset: %.2f secs',
                  time.time() - tic)
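# A standalone sketch of the float-coercion step above, assuming NULL_REPR is
# a string placeholder for missing values in the raw frame (the real constant
# is defined elsewhere in the codebase):
def _demo_null_to_float():
    import numpy as np
    import pandas as pd
    NULL_REPR = '_nan_'  # assumed placeholder value
    df = pd.DataFrame({'price': ['1.5', NULL_REPR, '3.0']})
    # Replace the placeholder with NaN, then coerce the column to float so
    # the DB stores a proper numeric type.
    df.loc[df['price'] == NULL_REPR, 'price'] = np.nan
    df['price'] = df['price'].astype(float)
    assert df['price'].isna().sum() == 1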
def __call__(self, data, flowids=None):
    """Flowize the data.

    Parameters
    ----------
    data : dataset.Table
        a table
    flowids : dataset.Table
        thread the flowids throughout the subsequent calls
    """
    from numpy import array, abs, vstack, squeeze
    from dataset import Table
    from sys import stdout
    flow2str2 = lambda x, f: flow2str(
        x, f, dns=self.reverse_dns, services=self.reverse_dns)
    pay = data.select((data.proto == self.protocol), order='time',
                      retdset=True)
    flow = set(squeeze(flowids['flow'])) if flowids else set()
    hashes = {}
    negate = lambda x: -abs(x)
    l = 0
    dropped = 0
    droppedips = set()
    for x in pay:
        t = tuple(scalar(x[f]) for f in self.fflow)
        h = hash(t)
        tr = tuple(scalar(x[f]) for f in self.bflow)
        hr = hash(tr)
        negative = False
        if h in flow:
            pass
        elif hr in flow:
            negative = True
        elif h in hashes:
            if hashes[h] != t:
                stdout.write(tmpl_colision % (flow2str2(t, self.fflow), h))
                stdout.flush()
                dropped += 1
                continue
        elif hr in hashes:
            if hashes[hr] != tr:
                stdout.write(tmpl_colision % (flow2str2(tr, self.bflow), hr))
                stdout.flush()
                dropped += 1
                continue
            negative = True
        else:
            if self.usesyns:
                if 'packets' in pay:
                    # we deal with netflow data and thus we demand SYN
                    # and don't care about ACK
                    syn = scalar(x['flags']) & 2 == 2
                else:
                    # we deal with pcap data, so a SYN packet is
                    # distinguishable; we don't care about SYN+ACK
                    syn = scalar(x['flags']) & 18 == 2
                if not syn:
                    stdout.write(
                        '\r****** no syn packet in %s (hash: %d) (flags: %d)\n'
                        % (flow2str2(t, self.fflow), h, scalar(x['flags'])))
                    stdout.write(tmpl_progress %
                                 (100. * (l + dropped) / len(pay), dropped,
                                  (l + dropped)))
                    stdout.flush()
                    dropped += 1
                    droppedips.add((scalar(x['dst']), scalar(x['dport'])))
                    continue
            stdout.write('\r###### new flow %s (hash: %d)\n' %
                         (flow2str2(t, self.fflow), h))
            stdout.write(tmpl_progress %
                         (100. * (l + dropped) / len(pay), dropped,
                          (l + dropped)))
            stdout.flush()
            hashes[h] = t
        x['flow'] = h
        if negative:
            if 'paylen' in x:
                x['paylen'] = negate  # broadcasting lambda
            elif 'size' in x:
                x['size'] = negate  # broadcasting lambda
        l += 1
        if l % 10 == 0:
            stdout.write(tmpl_progress %
                         (100. * (l + dropped) / len(pay), dropped,
                          (l + dropped)))
            stdout.flush()
    stdout.write(tmpl_progress2 % (dropped, (l + dropped)))
    stdout.write('\n%s\n' % [(int2ip(d), pd) for d, pd in droppedips])
    stdout.flush()
    if 'paylen' in pay:
        pay = pay.select(pay.paylen != 0, order='time', retdset=True,
                         fields=self.fields)
    else:
        pay = pay.select(None, order='time', retdset=True,
                         fields=self.fields)
    if not flowids:
        return Table(data=array(tuple((j,) + k for j, k in hashes.items())),
                     fields=('flow',) + self.fflow), pay
    if not len(hashes):
        return flowids, pay
    d = array(tuple((j,) + k for j, k in hashes.items()))
    return Table(data=vstack((flowids.data, d)),
                 fields=('flow',) + self.fflow), pay
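# A minimal sketch of the forward/backward flow matching used above: a flow is
# keyed by the hash of its 5-tuple, and a packet travelling in the reverse
# direction is recognized by hashing the tuple with its endpoints swapped.
# Field order is assumed to be (src, sport, dst, dport, proto); the addresses
# are illustrative.
def _demo_flow_direction():
    pkt = ('10.0.0.2', 80, '10.0.0.1', 1234, 6)   # reply packet
    rev = (pkt[2], pkt[3], pkt[0], pkt[1], pkt[4])  # reversed 5-tuple
    known = {hash(rev): rev}                        # forward direction seen earlier
    negative = hash(pkt) not in known and hash(rev) in known
    # In __call__ above, such reverse-direction records keep the flow hash but
    # get their paylen/size negated to encode direction.
    assert negative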
def __call__(self, h5grp):
    from numpy import array, vectorize, unique
    from dataset import Table
    if 'flowids' not in h5grp:
        raise ValueError(
            "expecting 'flowids' dataset to be present in h5 group")
    fl = h5grp['flowids']
    fl.add_field('annot', -1)
    flows = {}
    annot = dict((f['idx'], f) for f in self.filters)
    for f in self.filters:
        pred = f['predicate']
        match = fl[pred]
        if not len(match):
            continue
        if (match['annot'] != -1).any():
            colliding = [
                annot[scalar(a)]['annotation']
                for a in unique(match['annot']) if a in annot
            ]
            if ('dstIPs' not in f or 'dstPorts' not in f
                    or not f['dstIPs'] or not f['dstPorts']):
                print(
                    colorize(None, boldred, red, boldyellow, yellow, red) *
                    '## #warning#: #colliding filter# %s, %s #is ignored#: %d'
                    % (f['annotation'], ', '.join(colliding),
                       (match['annot'] != -1).sum()))
                match = match[match.annot == -1]
            else:
                print(
                    colorize(None, boldred, red, boldyellow, yellow, red) *
                    '## #warning#: #filters# %s, %s #are not disjoint, you will probably lose some information#: '
                    % (f['annotation'], ', '.join(colliding)),
                    (match['annot'] != -1).sum())
        fl[pred, 'annot'] = f.get('idx')
        print(
            colorize(None, green, boldblue, green, boldgreen) *
            '## #filtering# %d #flowids using filter# %s' %
            (len(match), f['annotation']))
        if len(match) == 1:
            flows[scalar(match['flow'].squeeze())] = f.get('idx')
        elif len(match) > 1:
            flows.update((i, f.get('idx')) for i in match['flow'].squeeze())
    if 'annot' in h5grp:
        print(
            colorize(None, boldred, red) *
            '## #warning#: #annotation dataset is going to be overwritten#')
        del h5grp['annot']
    fl[..., 'flow', 'annot'].save(h5grp['annot/flowids'])
    annots = {}
    for f in self.filters:
        if 'idx' in f:
            annots[f['idx']] = (f.get('type'), '%s (%s)' %
                                (f.get('annotation'), f.get('fileName')))
    a = array([list((k,) + v) for k, v in annots.items()], dtype=str)
    annot = Table(data=a, fields=('annot', 'type', 'caption'))
    annot.save(h5grp['annot/annotations'])
    flow2annot = vectorize(lambda x: flows[x] if x in flows else -1)
    lbl = flow2annot(h5grp['y'])
    h5grp['annot/y'] = lbl
    return flows, annots, lbl
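# A minimal sketch of the label-assignment step above: numpy.vectorize maps
# each flow hash through the flows dict, defaulting to -1 for flows no filter
# annotated. The hash and index values are illustrative only.
def _demo_flow2annot():
    import numpy as np
    flows = {101: 0, 202: 3}  # flow hash -> filter idx
    flow2annot = np.vectorize(lambda x: flows[x] if x in flows else -1)
    assert flow2annot(np.array([101, 555, 202])).tolist() == [0, -1, 3]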