Example #1
    def __call__(self, h5grp):
        from numpy import array, vectorize, unique
        from dataset import Table
        # scalar(), colorize() and the colour constants are assumed to be
        # provided by the surrounding module
        if 'flowids' not in h5grp:
            raise ValueError("expecting 'flowids' dataset to be present in h5 group")
        fl = h5grp['flowids']
        fl.add_field('annot', -1)
        flows = {}
        annot = dict((f['idx'], f) for f in self.filters)
        for f in self.filters:
            pred = f['predicate']
            match = fl[pred]
            if not len(match):
                continue
            if (match['annot'] != -1).any():
                colliding = [annot[scalar(a)]['annotation']
                             for a in unique(match['annot']) if a in annot]
                if ('dstIPs' not in f or 'dstPorts' not in f
                        or not f['dstIPs'] or not f['dstPorts']):
                    print(colorize(None, boldred, red, boldyellow, yellow, red) *
                          '## #warning#: #colliding filter# %s, %s #is ignored#: %d'
                          % (f['annotation'], ', '.join(colliding),
                             (match['annot'] != -1).sum()))
                    match = match[match.annot == -1]
                else:
                    print(colorize(None, boldred, red, boldyellow, yellow, red) *
                          '## #warning#: #filters# %s, %s #are not disjoint, '
                          'you will probably lose some information#: '
                          % (f['annotation'], ', '.join(colliding)),
                          (match['annot'] != -1).sum())
            fl[pred, 'annot'] = f.get('idx')
            print(colorize(None, green, boldblue, green, boldgreen) *
                  '## #filtering# %d #flowids using filter# %s'
                  % (len(match), f['annotation']))
            if len(match) == 1:
                flows[scalar(match['flow'].squeeze())] = f.get('idx')
            elif len(match) > 1:
                flows.update((i, f.get('idx')) for i in match['flow'].squeeze())
        if 'annot' in h5grp:
            print(colorize(None, boldred, red) *
                  '## #warning#: #annotation dataset is going to be overwritten#')
            del h5grp['annot']
        fl[..., 'flow', 'annot'].save(h5grp['annot/flowids'])
        annots = {}
        for f in self.filters:
            if 'idx' in f:
                annots[f['idx']] = (f.get('type'),
                                    '%s (%s)' % (f.get('annotation'), f.get('fileName')))
        # items() replaces the Python 2-only iteritems()
        a = array([list((k,) + v) for k, v in annots.items()], dtype=str)
        annot = Table(data=a, fields=('annot', 'type', 'caption'))
        annot.save(h5grp['annot/annotations'])

        # map each flow id in h5grp['y'] to its annotation index (-1 = unannotated)
        flow2annot = vectorize(lambda x: flows.get(x, -1))
        lbl = flow2annot(h5grp['y'])
        h5grp['annot/y'] = lbl

        return flows, annots, lbl
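Judging from the keys the loop reads, each entry in self.filters appears to be a dict with at least 'idx', 'predicate' and 'annotation', plus optional 'type', 'fileName', 'dstIPs' and 'dstPorts'. A hypothetical instance, purely for orientation (none of these values come from the original):

# hypothetical filter record, inferred from the fields read above
f = {
    'idx': 3,                    # annotation index written into 'annot'
    'predicate': ...,            # selector understood by fl[pred]; exact form not shown here
    'annotation': 'botnet C&C',  # human-readable label
    'type': 'malicious',         # stored in the annotations table
    'fileName': 'cc.pcap',       # provenance, shown in the caption
    'dstIPs': [...],             # optional; together with 'dstPorts',
    'dstPorts': [...],           # their presence relaxes collision handling
}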
Example #2
from typing import Any, Dict

import dataset


def insert_and_set_id(table: dataset.Table,
                      obj: Dict[str, Any],
                      idfield: str = 'id') -> Any:  # but typically int
    """The dataset table's insert() command returns the primary key.
    However, it doesn't store that back, and we want users to do that
    consistently."""
    pk = table.insert(obj)
    obj[idfield] = pk
    return pk
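A quick usage sketch against the dataset library with an in-memory SQLite database (the table and field values are illustrative):

import dataset

db = dataset.connect('sqlite:///:memory:')  # any SQLAlchemy URL works
users = db['users']  # tables are created lazily on first insert

row = {'name': 'alice'}
pk = insert_and_set_id(users, row)
assert row['id'] == pk  # the primary key is now stored back on the dict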
Example #4
def get_raw(opt, callback=None, keys=(), h5=None):
    from extractor import PcapExtractor, FlowExtractor
    from dataset import Table
    from os.path import isfile, basename

    if opt.in_format == 'pcap':
        extract = PcapExtractor(('time', 'src', 'sport', 'dst', 'dport',
                                 'proto', 'paylen', 'flags', 'flow'))
        tr = h5['traces'] if h5 else None
        parser = get_packets
    elif opt.in_format == 'netflow':
        extract = FlowExtractor(
            ('time', 'duration', 'src', 'sport', 'dst', 'dport', 'proto',
             'packets', 'size', 'flags', 'flows', 'flow'))
        tr = h5['netflows'] if h5 else None
        parser = get_netflow
    else:
        raise NotImplementedError('unsupported in_format: %s' % opt.in_format)

    if tr is not None:
        # default callback stores each extracted table under its file name
        if not callable(callback):
            callback = lambda data, fn: data.save(tr[fn])
        if not keys:
            keys = tr.keys()
    else:
        if not callable(callback):
            raise ValueError('either a callback or an h5 file is needed')

    for fn in opt.file:
        if isfile(fn) and basename(fn) not in keys:
            print('## Extracting features from file %s...' % fn)
            pkts = parser(fn, extract)
            print('\t%d records captured' % len(pkts))
            data = Table(pkts, extract.fields)
            del pkts
            print('## Storing matrix in %s...' % opt.database)
            callback(data, basename(fn))
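A minimal sketch of driving get_raw without an HDF5 file; opt stands in for parsed command-line options, the attribute names follow the code above, and the file name and callback are illustrative:

from types import SimpleNamespace

results = {}
opt = SimpleNamespace(in_format='pcap',        # selects PcapExtractor + get_packets
                      file=['trace0.pcap'],    # hypothetical input capture
                      database='features.h5')  # only used in the log message
# with h5=None a callable callback is mandatory; this one just collects the tables
get_raw(opt, callback=lambda data, fn: results.update({fn: data}))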
Example #5
    def load_quantized_data(self, df):
        tic = time.time()
        name = self.ds.raw_data.name + '_quantized'
        self.ds.quantized_data = Table(name, Source.DF, df=df)

        # Re-store to DB, ensuring numerical values are stored as floats.
        df_correct_type = df.copy()
        for attr in self.ds.numerical_attrs:
            df_correct_type.loc[df_correct_type[attr] == NULL_REPR,
                                attr] = np.nan
            df_correct_type[attr] = df_correct_type[attr].astype(float)
        df_correct_type.to_sql(name,
                               self.ds.engine.engine,
                               if_exists='replace',
                               index=False,
                               index_label=None)

        for attr in self.ds.quantized_data.get_attributes():
            self.ds.quantized_data.create_db_index(self.ds.engine, [attr])
        logging.debug('Time to load quantized dataset: %.2f secs',
                      time.time() - tic)
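The float re-cast above only works because the textual null sentinel is replaced first; the same pattern in a self-contained pandas snippet (the '_nan_' sentinel and column name are assumptions for the demo):

import numpy as np
import pandas as pd

NULL_REPR = '_nan_'  # assumed sentinel; the real value comes from project config

df = pd.DataFrame({'temperature': ['21.5', NULL_REPR, '19.0']})
df.loc[df['temperature'] == NULL_REPR, 'temperature'] = np.nan  # sentinel -> NaN
df['temperature'] = df['temperature'].astype(float)             # now castable
print(df['temperature'].dtype)  # float64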
Example #6
    def __call__(self, data, flowids=None):
        """Flowize the data.

        Parameters
        ----------
        data : dataset.Table
            a table of packets or netflow records

        flowids : dataset.Table
            threads the flow ids through the subsequent calls

        """
        from numpy import array, abs, vstack, squeeze
        from dataset import Table
        from sys import stdout

        flow2str2 = lambda x, f: flow2str(
            x, f, dns=self.reverse_dns, services=self.reverse_dns)

        pay = data.select((data.proto == self.protocol),
                          order='time',
                          retdset=True)
        flow = set(squeeze(flowids['flow'])) if flowids else set()
        hashes = {}  # if flowids is None else dict((scalar(f['flow']), tuple(f[self.fflow])) for f in flowids)
        negate = lambda x: -abs(x)
        l = 0
        dropped = 0
        droppedips = set()
        for x in pay:
            t = tuple(scalar(x[f]) for f in self.fflow)
            h = hash(t)
            tr = tuple(scalar(x[f]) for f in self.bflow)
            hr = hash(tr)
            negative = False

            if h in flow:
                pass
            elif hr in flow:
                negative = True
            elif h in hashes:
                if hashes[h] != t:
                    stdout.write(tmpl_colision % (flow2str2(t, self.fflow), h))
                    stdout.flush()
                    dropped += 1
                    continue
            elif hr in hashes:
                if hashes[hr] != tr:
                    stdout.write(tmpl_colision %
                                 (flow2str2(tr, self.bflow), hr))
                    stdout.flush()
                    dropped += 1
                    continue
                negative = True
            else:
                if self.usesyns:
                    if 'packets' in pay:  # netflow data: demand SYN, don't care about ACK
                        syn = scalar(x['flags']) & 2 == 2
                    else:  # pcap data: the SYN packet is distinguishable,
                        # so demand SYN without ACK
                        syn = scalar(x['flags']) & 18 == 2
                    if not syn:
                        stdout.write(
                            '\r****** no syn packet in %s (hash: %d) (flags: %d)\n'
                            %
                            (flow2str2(t, self.fflow), h, scalar(x['flags'])))
                        stdout.write(tmpl_progress %
                                     (100. * (l + dropped) / len(pay), dropped,
                                      (l + dropped)))
                        stdout.flush()
                        dropped += 1
                        droppedips.add((scalar(x['dst']), scalar(x['dport'])))
                        continue
                stdout.write('\r###### new flow %s (hash: %d)\n' %
                             (flow2str2(t, self.fflow), h))
                stdout.write(tmpl_progress %
                             (100. * (l + dropped) / len(pay), dropped,
                              (l + dropped)))
                stdout.flush()
                hashes[h] = t
            x['flow'] = h
            if negative:
                if 'paylen' in x:
                    x['paylen'] = negate  # broadcasting lambda
                elif 'size' in x:
                    x['size'] = negate  # broadcasting lambda
            l += 1
            if l % 10 == 0:
                stdout.write(tmpl_progress %
                             (100. * (l + dropped) / len(pay), dropped,
                              (l + dropped)))
                stdout.flush()
        stdout.write(tmpl_progress2 % (dropped, (l + dropped)))
        stdout.write('\n%s\n' % [(int2ip(d), pd) for d, pd in droppedips])
        stdout.flush()
        if 'paylen' in pay:
            pay = pay.select(pay.paylen != 0,
                             order='time',
                             retdset=True,
                             fields=self.fields)
        else:
            pay = pay.select(None,
                             order='time',
                             retdset=True,
                             fields=self.fields)

        if not flowids:
            return Table(data=array(tuple(
                (j, ) + k for j, k in hashes.items())),
                         fields=('flow', ) + self.fflow), pay
        else:
            if not len(hashes):
                return flowids, pay
            else:
                d = array(tuple((j, ) + k for j, k in hashes.items()))
                return Table(data=vstack((flowids.data, d)),
                             fields=('flow', ) + self.fflow), pay
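The heart of the method is matching each record to a flow by hashing its tuple in both the forward and the reversed direction, so replies land in the same flow as requests. A self-contained sketch of that idea (the addresses, ports and 5-tuple layout are made up for the demo):

def flow_hashes(src, sport, dst, dport, proto):
    # forward and reversed 5-tuple hashes for one packet
    fwd = (src, sport, dst, dport, proto)
    bwd = (dst, dport, src, sport, proto)
    return hash(fwd), hash(bwd)

flows = {}  # forward 5-tuple keyed by its hash
packets = [('10.0.0.1', 1234, '10.0.0.2', 80, 6),   # request
           ('10.0.0.2', 80, '10.0.0.1', 1234, 6)]   # reply, same flow
for pkt in packets:
    h, hr = flow_hashes(*pkt)
    if h in flows:
        print('forward packet of flow', h)
    elif hr in flows:
        print('backward packet of flow', hr)  # the real code negates sizes here
    else:
        flows[h] = pkt
        print('new flow', h)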