def annotate(opt, h5=None):
    """Run a ``Labeler`` over every matching sample set of the database.

    A sample set is an entry of ``h5['samples']`` that carries both a
    ``.srate`` and a ``.wndsize`` attribute.  When ``opt.srate`` is non-empty
    only sample sets whose sampling rate is contained in it are labeled, and
    likewise for ``opt.window`` and the window size.

    Args:
        opt: Options object; ``srate`` and ``window`` are read here and the
            whole object is handed to ``Labeler``.
        h5: Database node exposing ``samples``.  When omitted it is built
            with ``H5Node(opt)``, the same fallback ``get_models`` uses.

    Returns:
        None.  Nothing is labeled when the database has no ``samples`` entry.
    """
    from labeling import Labeler

    labeler = Labeler(opt)
    labeler.prepare()

    # The default h5=None used to reach the membership test below and fail
    # with "argument of type 'NoneType' is not iterable".
    if h5 is None:
        h5 = H5Node(opt)
    if 'samples' not in h5:
        return

    for k, sampl in h5['samples'].iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue
        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])

        # An empty/None filter accepts everything.
        srate_rejected = opt.srate and srate not in opt.srate
        window_rejected = opt.window and wndsize not in opt.window
        if srate_rejected or window_rejected:
            continue

        print(colorize(None, boldblue, green) * '\n\n## #labeling sampleset# %s' % k)
        labeler(sampl)
def annotate(opt, h5=None):
    """Run a ``Labeler`` over every matching sample set of the database.

    A sample set is an entry of ``h5['samples']`` that carries both a
    ``.srate`` and a ``.wndsize`` attribute.  When ``opt.srate`` is non-empty
    only sample sets whose sampling rate is contained in it are labeled, and
    likewise for ``opt.window`` and the window size.

    Args:
        opt: Options object; ``srate`` and ``window`` are read here and the
            whole object is handed to ``Labeler``.
        h5: Database node exposing ``samples``.  When omitted it is built
            with ``H5Node(opt)``, the same fallback ``get_models`` uses.

    Returns:
        None.  Nothing is labeled when the database has no ``samples`` entry.
    """
    from labeling import Labeler

    labeler = Labeler(opt)
    labeler.prepare()

    # The default h5=None used to reach the membership test below and fail
    # with "argument of type 'NoneType' is not iterable".
    if h5 is None:
        h5 = H5Node(opt)
    if 'samples' not in h5:
        return

    for k, sampl in h5['samples'].iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue
        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])

        # An empty/None filter accepts everything.
        srate_rejected = opt.srate and srate not in opt.srate
        window_rejected = opt.window and wndsize not in opt.window
        if srate_rejected or window_rejected:
            continue

        print(colorize(None, boldblue, green) * '\n\n## #labeling sampleset# %s' % k)
        labeler(sampl)
def partition(data, device, train_size=0.8):
    """Static partition method.

    Performs partition by "plucking" (1 - train_size) * n messages out of
    the provided data set, and places them in a new test set.  Both sets keep
    the relative ordering of the original data, and ``data`` itself is left
    untouched (the sets are built from a deep copy).

    Args:
        data (list): The original formatted BGP data.
        device (str): The device on which the tensors should be
            created/stored.
        train_size (float): The proportion of the data to use as a training
            set.

    Returns:
        A tuple ``(Xtrain, Ttrain, Xtest, Ttest)``: the training inputs and
        targets followed by the test inputs and targets, all as tensors on
        ``device``.

    Raises:
        ValueError: Raised by ``random.sample`` when the computed test-set
            size is negative or larger than ``len(data)``.
    """
    messages = copy.deepcopy(data)
    total = len(messages)

    # int() truncates, and binary floating point makes e.g. (1 - 0.8) * 10
    # evaluate to 1.9999999999999996, which used to yield 1 instead of 2.
    # The small tolerance absorbs that representation error while keeping
    # the round-down behaviour for genuinely fractional sizes.
    test_len = int((1 - train_size) * total + 1e-9)
    test_indices = set(random.sample(range(total), test_len))

    # Single pass instead of repeated list.pop(i); ordering is preserved in
    # both sets.
    train = [msg for i, msg in enumerate(messages) if i not in test_indices]
    test = [msg for i, msg in enumerate(messages) if i in test_indices]

    # Now label each set individually (performed in place)
    Labeler(train)
    Labeler(test)

    # Rescale data as well
    train = DataRescaler(train).scaled_data
    test = DataRescaler(test).scaled_data

    def as_inputs(samples):
        """Build the input tensor: time followed by the composite values."""
        rows = [[s.get('time')] + list(s.get('composite').values())
                for s in samples]
        return torch.tensor(rows, dtype=torch.double).to(device)

    def as_targets(samples):
        """Build the target tensor from each sample's 'distinct' value."""
        rows = [[s.get('distinct')] for s in samples]
        return torch.tensor(rows, dtype=torch.long).to(device)

    Xtrain = as_inputs(train)
    Xtest = as_inputs(test)
    Ttrain = as_targets(train)
    Ttest = as_targets(test)

    return (Xtrain, Ttrain, Xtest, Ttest)
def test_labeling(self):
    """Check ``Labeler.get_label`` for labelers built with 1 and with 3.

    The expected labels are one character long for ``Labeler(1)`` and three
    characters long for ``Labeler(3)``.
    """
    single_cases = (
        ((2.0, 5.0), 'a'),
        ((5.0, 2.0), 'a'),
        ((6.0, 5.0), 'b'),
        ((6.0, 7.0), 'c'),
        ((4.0, 7.0), 'd'),
    )
    triple_cases = (
        ((1.1, 2.0), 'aad'),
        ((2.0, 2.0), 'aac'),
        ((2.6, 2.6), 'aca'),
    )

    labeler = Labeler(1)
    for args, expected in single_cases:
        self.assertEqual(labeler.get_label(*args), expected)

    labeler = Labeler(3)
    for args, expected in triple_cases:
        self.assertEqual(labeler.get_label(*args), expected)
def get_models(opt, h5=None):
    """Evaluate the configured model pipelines on datasets from the database.

    Candidate datasets are the entries of ``h5['samples']`` that carry both a
    ``.srate`` and a ``.wndsize`` attribute and pass the optional
    ``opt.srate``, ``opt.window`` and ``opt.sample`` (regular expression)
    filters.  When more than one dataset qualifies, the user is prompted for
    a comma-separated list of the printed indices; the prompt is repeated
    until every entry names a listed dataset.

    Each selected dataset is annotated with a ``Labeler`` if it has no
    ``annot`` entry, then passed to ``evaluate`` and its results to
    ``plot_roc``.  If ``opt.tex`` is set, a LaTeX table with the mean and
    standard deviation of each result's AUC values is appended to that file.

    Side effect: ``opt.computations`` is filled with a default list of
    pipelines when it is empty.

    Args:
        opt: Options object.  The attributes read here are ``database``,
            ``srate``, ``window``, ``sample``, ``computations``, ``model``,
            ``legit``, ``malicious`` and ``tex``.
        h5: Database node exposing ``samples``; ``H5Node(opt)`` is used when
            it is missing or falsy.

    Returns:
        ``(m, ((fit, binarize, classes), res))`` as produced by ``evaluate``
        for the last dataset processed, or ``None`` when no dataset was
        selected.
    """
    from models import evaluate, plot_roc, fapply, Mahalanobis, Momentum, FreqThresh, FreqBands
    # Scaler and GMM are only referenced by the commented-out steps below.
    from sklearn.preprocessing import Scaler
    from sklearn.decomposition import PCA
    from sklearn.mixture import GMM, DPGMM
    from sklearn.manifold import LocallyLinearEmbedding, Isomap
    from labeling import Labeler
    import re

    if not h5:
        h5 = H5Node(opt)
    samples = h5['samples']

    print(colorize(boldblue, green) * '#datasets found in database# %s:' % opt.database)

    # Map the index shown to the user onto (name, node, srate, wndsize).
    datasets = {}
    for k, sampl in samples.iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue
        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])
        if (opt.srate and srate not in opt.srate) or (opt.window and wndsize not in opt.window):
            continue
        if opt.sample and not re.findall(opt.sample, k):
            continue
        index = len(datasets)
        print(colorize(boldyellow, green) * '[%d] %s : (srate=%f, wndsize=%d)' % (index, k, srate, wndsize))
        datasets[index] = (k, sampl, srate, wndsize)

    if len(datasets) > 1:
        selected = []
        while not selected:
            answer = raw_input('datasets to use:')
            try:
                selected = [datasets[int(tok.strip())] for tok in answer.split(',')]
            except (ValueError, KeyError):
                # A non-numeric or unlisted index used to abort the whole
                # run; ask again instead.
                print('invalid selection: %r' % answer)
                selected = []
    else:
        selected = list(datasets.values())

    steps = {
        #'Scaler': fapply( Scaler ),
        'Bands': fapply(FreqBands, 2, 5, 10),
        #'BandsLg': fapply( FreqBands, 2,5,10, log_scale=True ),
        'Threshold': fapply(FreqThresh, 0),
        'Momentum': fapply(Momentum, 'vks'),
        #'GMM' : fapply( GMM, 1, 5, covariance_type='diag', n_iter=40 ),
        'DPGMM': fapply(DPGMM, covariance_type='diag', n_iter=40),
        'Mahal': fapply(Mahalanobis, False),
        'PCA': fapply(PCA, 1, 3),
        'PCA2': fapply(PCA),
        #'PCAw': fapply( PCA, 3, 10 , whiten=True )
    }

    if not opt.computations:
        opt.computations = [
            #('Bands', 'DPGMM'),
            ('Bands', 'Mahal'),
            #('BandsLg', 'DPGMM'),
            #('Threshold','DPGMM'),
            #('Threshold', 'Mahal'),
            ('Threshold', 'Momentum', 'Mahal'),
            #('Threshold','MomentumMVKS', 'DPGMM' ),
            ('Threshold', 'PCA', 'Mahal'),
            #('Threshold', 'PCA', 'DPGMM' ),
            #('Threshold', 'PCAw', 'DPGMM' )
        ]

    def split_to_ints(text):
        """Return the digit-only items of a comma-separated string as ints."""
        stripped = (item.strip() for item in text.split(',') if isString(item))
        return [int(item) for item in stripped if item.isdigit()]

    # These do not depend on the dataset, so parse them once.
    model = split_to_ints(opt.model) if opt.model is not None else None
    legit = split_to_ints(opt.legit) if opt.legit is not None else None
    malicious = split_to_ints(opt.malicious) if opt.malicious is not None else None

    # Same characters as the original single raw literal, split for width.
    tex_table = (
        r' \begin{table}[h] \begin{center} \begin{tabular}{c|cc} '
        r'Method & $\overline{\mu_{auc}}$ & $\overline{\sigma_{auc}}$ \\ \hline '
        r'%s \end{tabular} \end{center} '
        r'\caption{Mean and standard deviation of the area under ROC curve.} '
        r'\end{table} '
    )

    # NOTE(review): the extent of this loop body was reconstructed from a
    # copy of the source whose indentation was lost -- confirm that the
    # evaluation is meant to run once per selected dataset.
    result = None
    for k, sampl, srate, wndsize in selected:
        print('## processing %s' % k)
        if 'annot' not in sampl:
            labeler = Labeler(opt)
            labeler.prepare()
            labeler(sampl)

        #sampl, = [ h5[s] for s in ('/samples/data_psd_0.003300_200_simulated/', '/samples/data_psd_100.000000_200_simulated/') if s in h5 ]
        m, ((fit, binarize, classes), res) = evaluate(
            opt, None, sampl, steps=steps, model=model, legit=legit,
            malicious=malicious)
        plot_roc(res, 'ROC curves')

        if opt.tex:
            # 'with' closes the file even if formatting a row fails.
            with open(opt.tex, 'a') as tex_file:
                rows = '\\\\ \\hline\n'.join(
                    '%s & %.3f & %.3f' % (name.replace('_', '\\_'), np.mean(auc), np.std(auc))
                    for name, auc, _ in res)
                tex_file.write('\n')
                tex_file.write(tex_table % rows)
                tex_file.write('\n')

        result = m, ((fit, binarize, classes), res)

    # None (rather than an unbound-name error) when nothing was selected.
    return result
def get_models(opt, h5=None):
    """Evaluate the configured model pipelines on datasets from the database.

    Candidate datasets are the entries of ``h5['samples']`` that carry both a
    ``.srate`` and a ``.wndsize`` attribute and pass the optional
    ``opt.srate``, ``opt.window`` and ``opt.sample`` (regular expression)
    filters.  When more than one dataset qualifies, the user is prompted for
    a comma-separated list of the printed indices; the prompt is repeated
    until every entry names a listed dataset.

    Each selected dataset is annotated with a ``Labeler`` if it has no
    ``annot`` entry, then passed to ``evaluate`` and its results to
    ``plot_roc``.  If ``opt.tex`` is set, a LaTeX table with the mean and
    standard deviation of each result's AUC values is appended to that file.

    Side effect: ``opt.computations`` is filled with a default list of
    pipelines when it is empty.

    Args:
        opt: Options object.  The attributes read here are ``database``,
            ``srate``, ``window``, ``sample``, ``computations``, ``model``,
            ``legit``, ``malicious`` and ``tex``.
        h5: Database node exposing ``samples``; ``H5Node(opt)`` is used when
            it is missing or falsy.

    Returns:
        ``(m, ((fit, binarize, classes), res))`` as produced by ``evaluate``
        for the last dataset processed, or ``None`` when no dataset was
        selected.
    """
    from models import evaluate, plot_roc, fapply, Mahalanobis, Momentum, FreqThresh, FreqBands
    # Scaler and GMM are only referenced by the commented-out steps below.
    from sklearn.preprocessing import Scaler
    from sklearn.decomposition import PCA
    from sklearn.mixture import GMM, DPGMM
    from sklearn.manifold import LocallyLinearEmbedding, Isomap
    from labeling import Labeler
    import re

    if not h5:
        h5 = H5Node(opt)
    samples = h5['samples']

    print(colorize(boldblue, green) * '#datasets found in database# %s:' % opt.database)

    # Map the index shown to the user onto (name, node, srate, wndsize).
    datasets = {}
    for k, sampl in samples.iteritems():
        if '.srate' not in sampl or '.wndsize' not in sampl:
            continue
        srate = scalar(sampl['.srate'])
        wndsize = scalar(sampl['.wndsize'])
        if (opt.srate and srate not in opt.srate) or (opt.window and wndsize not in opt.window):
            continue
        if opt.sample and not re.findall(opt.sample, k):
            continue
        index = len(datasets)
        print(colorize(boldyellow, green) * '[%d] %s : (srate=%f, wndsize=%d)' % (index, k, srate, wndsize))
        datasets[index] = (k, sampl, srate, wndsize)

    if len(datasets) > 1:
        selected = []
        while not selected:
            answer = raw_input('datasets to use:')
            try:
                selected = [datasets[int(tok.strip())] for tok in answer.split(',')]
            except (ValueError, KeyError):
                # A non-numeric or unlisted index used to abort the whole
                # run; ask again instead.
                print('invalid selection: %r' % answer)
                selected = []
    else:
        selected = list(datasets.values())

    steps = {
        #'Scaler': fapply( Scaler ),
        'Bands': fapply(FreqBands, 2, 5, 10),
        #'BandsLg': fapply( FreqBands, 2,5,10, log_scale=True ),
        'Threshold': fapply(FreqThresh, 0),
        'Momentum': fapply(Momentum, 'vks'),
        #'GMM' : fapply( GMM, 1, 5, covariance_type='diag', n_iter=40 ),
        'DPGMM': fapply(DPGMM, covariance_type='diag', n_iter=40),
        'Mahal': fapply(Mahalanobis, False),
        'PCA': fapply(PCA, 1, 3),
        'PCA2': fapply(PCA),
        #'PCAw': fapply( PCA, 3, 10 , whiten=True )
    }

    if not opt.computations:
        opt.computations = [
            #('Bands', 'DPGMM'),
            ('Bands', 'Mahal'),
            #('BandsLg', 'DPGMM'),
            #('Threshold','DPGMM'),
            #('Threshold', 'Mahal'),
            ('Threshold', 'Momentum', 'Mahal'),
            #('Threshold','MomentumMVKS', 'DPGMM' ),
            ('Threshold', 'PCA', 'Mahal'),
            #('Threshold', 'PCA', 'DPGMM' ),
            #('Threshold', 'PCAw', 'DPGMM' )
        ]

    def split_to_ints(text):
        """Return the digit-only items of a comma-separated string as ints."""
        stripped = (item.strip() for item in text.split(',') if isString(item))
        return [int(item) for item in stripped if item.isdigit()]

    # These do not depend on the dataset, so parse them once.
    model = split_to_ints(opt.model) if opt.model is not None else None
    legit = split_to_ints(opt.legit) if opt.legit is not None else None
    malicious = split_to_ints(opt.malicious) if opt.malicious is not None else None

    # Same characters as the original single raw literal, split for width.
    tex_table = (
        r' \begin{table}[h] \begin{center} \begin{tabular}{c|cc} '
        r'Method & $\overline{\mu_{auc}}$ & $\overline{\sigma_{auc}}$ \\ \hline '
        r'%s \end{tabular} \end{center} '
        r'\caption{Mean and standard deviation of the area under ROC curve.} '
        r'\end{table} '
    )

    # NOTE(review): the extent of this loop body was reconstructed from a
    # copy of the source whose indentation was lost -- confirm that the
    # evaluation is meant to run once per selected dataset.
    result = None
    for k, sampl, srate, wndsize in selected:
        print('## processing %s' % k)
        if 'annot' not in sampl:
            labeler = Labeler(opt)
            labeler.prepare()
            labeler(sampl)

        #sampl, = [ h5[s] for s in ('/samples/data_psd_0.003300_200_simulated/', '/samples/data_psd_100.000000_200_simulated/') if s in h5 ]
        m, ((fit, binarize, classes), res) = evaluate(
            opt, None, sampl, steps=steps, model=model, legit=legit,
            malicious=malicious)
        plot_roc(res, 'ROC curves')

        if opt.tex:
            # 'with' closes the file even if formatting a row fails.
            with open(opt.tex, 'a') as tex_file:
                rows = '\\\\ \\hline\n'.join(
                    '%s & %.3f & %.3f' % (name.replace('_', '\\_'), np.mean(auc), np.std(auc))
                    for name, auc, _ in res)
                tex_file.write('\n')
                tex_file.write(tex_table % rows)
                tex_file.write('\n')

        result = m, ((fit, binarize, classes), res)

    # None (rather than an unbound-name error) when nothing was selected.
    return result