def random_score(n_iter=20, frac=0.2, dataset=ds.NSL_TRAIN):
    # Average the feature-imbalance scores over n_iter random samples of the
    # dataset so that a single unlucky sample does not dominate the result.
    data = NSL(dataset, ds.ENC_NUMERIC, ds.SCL_NONE).ds
    scores = np.zeros(data.shape[1])
    for _ in xrange(n_iter):
        # Score a random fraction (frac) of the rows on every iteration
        scores = scores + features_imbalance3(data.sample(frac=frac))
    return scores / n_iter
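# A minimal usage sketch (assumption: features_imbalance3 is defined in this
# module and returns one imbalance score per column):
# avg_scores = random_score(n_iter=5, frac=0.1)
# print(avg_scores)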
def test(data=ds.NSL_TRAIN20, frac=0.1, n_clusters=9):
    nsl = NSL(data, ds.ENC_NUMERIC, ds.SCL_NONE)
    data = pd.DataFrame(
        StandardScaler().fit_transform(nsl.ds.sample(frac=frac)),
        columns=nsl.ds.columns)
    scores = features_imbalance3(data)
    #scores = [4.9687348601488857e-06, 0.048005449773971885, 0.30090289606345372, 0.039999999997479371, 0.000583154682335534, 0.071427437376211239, 0.0006369753925856707, 0.0064933003025774748, 2.0059375750962523e-05, 0.12727121443046172, 0.0059171597629407305, 0.0071941874935393642, 0.11714027214813312, 0.0010052146319188752, 0.019230616569268669, 0.0019378614210404626, 0.0, 0.99999999993698407, 0.00084245998309771147, 9.3999531402734481e-08, 2.9174852834226813e-07, 4.1833931281974199e-07, 3.8195776838186883e-07, 1.1509224936799674e-06, 1.6633409340012933e-06, 1.2001268628255685e-07, 1.4342506054173522e-06, 2.8705243184823012e-06, 4.3492993562524208e-08, 6.8499758177045603e-08, 1.5217451831180736e-07, 9.5622234800440755e-07, 5.3457550733435897e-07, 4.43769710268333e-06, 2.7885047378256104e-07, 2.9052731267311483e-07, 6.6749552744942154e-07, 6.5967876419635182e-07]
    # scores2 = np.sqrt(np.array(scores).astype(float))
    # Down-weight each column by its imbalance score; columns with a zero or
    # undefined score are zeroed out instead.
    for i, col in enumerate(nsl.ds.columns):
        if scores[i] is not None and not np.isnan(
                scores[i]) and scores[i] != 0:
            data[col] = data[col] / scores[i]
        else:
            data[col] = 0
    km = KMeans(n_clusters=n_clusters, random_state=0)
    res = km.fit_predict(data)
    _, counts = np.unique(res, return_counts=True)
    return counts
def create_clustering_models():
    models = []
    nsl_features = [
        NSL.FEATURES_2SECS_HOST,
        NSL.FEATURES_2SECS_SERVICE,
        NSL.FEATURES_100CONNS_HOST,
        NSL.FEATURES_EXPERT,
        NSL.FEATURES_TCP,
        np.append(NSL.FEATURES_2SECS_HOST, NSL.FEATURES_2SECS_SERVICE),
        NSL.FEATURES_2SECS,
        NSL.FEATURES
    ]
    nsl_descs = [
        '2 secs same dest host', '2 secs same service',
        '100 connections same host', 'expert features',
        'single TCP features', 'all 2 secs', 'all history based features',
        'all features'
    ]
    for dataset in (ds.NSL_TRAIN20, ds.NSL_TEST):
        for encoding in (ds.ENC_NUMERIC, ds.ENC_HOT):
            for scaling in (ds.SCL_NONE, ds.SCL_MINMAX, ds.SCL_STD):
                nsl = NSL(dataset, encoding, scaling)
                models.append(CM(nsl).gen_model(MultiPart, seed=0))
                models.append(CM(nsl).gen_model(WKMeans, random_state=0))
                models.append(
                    CM(nsl).gen_model(EXLasso, random_state=0, gamma=0.1,
                                      tol=1e-2, verbose=True))
                models.append(
                    CM(nsl).gen_model(KMeansBal, random_state=0,
                                      clusters_factor=5))
        for encoding in (ds.ENC_NUMERIC, ds.ENC_HOT):
            for scaling in (ds.SCL_NONE, ds.SCL_MINMAX, ds.SCL_STD):
                nsl = NSL(dataset, encoding, scaling)
                # The following splitters are feature agnostic. However, they
                # are added for every encoding and scaling since it matters
                # for the ML classifiers later on in the process.
                models.append(CM(nsl).gen_model(RoundRobin))
                models.append(CM(nsl).gen_model(RandomRoundRobin))
                models.append(CM(nsl, min_k=1, max_k=1).gen_model(NoSplit))
                # Add all the clustering models, one per feature group
                for f, d in zip(nsl_features, nsl_descs):
                    models.append(
                        CM(nsl, f, d).gen_model(KMeans, random_state=0))

    for model in models:
        file_name = "%s_%s_%s_%s_%s.dmp" % (
            model.algorithm,
            model.features_desc,
            model.dataset.ds_name,
            model.dataset.encoding,
            model.dataset.scaling,
        )
        if isfile(os.path.join(MODELS_DIR, file_name)):
            print("Skipping %s" % file_name)
            continue
        print("Running model %s %s %s %s %s" %
              (model.dataset.ds_name, model.dataset.encoding,
               model.dataset.scaling, model.algorithm, model.features_desc))
        model.run()
        model.save(os.path.join(MODELS_DIR, file_name))
def eval_classifiers(report_file=CLASSIFIERS_REPORT, classifiers=None,
                     clusters_file=CLUSTERS_REPORT):
    clfmap = {'NN': nn, 'DT': dt3, 'RF': rf3, 'MLP': mlp, 'SVM': svmlin}
    # Default to evaluating all known classifiers
    if classifiers is None:
        classifiers = list(clfmap.keys())
    classifiers_ = [clfmap[clf] for clf in classifiers]

    clusters = pd.read_csv(clusters_file)
    csv_file = open(report_file, 'wb')
    csvwriter = csv.writer(csv_file)
    header = [
        'Dataset', 'Encoding', 'Scaling', 'Algo', 'Features', 'K',
        'Split Sizes', 'Classifier', 'Classifier Info', 'K-fold',
        'Training Time', 'Testing Time', 'F-Score', 'Precision', 'Recall'
    ]
    labels = NSL.standard_labels()
    for a in labels:
        for b in labels:
            header.append("True %s, Predicted %s" % (a, b))
    for a in labels:
        header.append("AUC %s" % a)
    csvwriter.writerow(header)

    for _, row in clusters.iterrows():
        if row['Valid?'] is not True or row['Other DS valid?'] is not True:
            continue
        algo = row['Algorithm']
        features = row['Features']
        dataset = row['Dataset']
        encoding = row['Encoding']
        scaling = row['Scaling']
        model_file = '%s_%s_%s_%s_%s.dmp' % (algo, features, dataset,
                                             encoding, scaling)
        model_file = join(MODELS_DIR, model_file)
        if not isfile(model_file):
            print('Model %s was not found!' % model_file)
            continue
        data = pickle.load(open(model_file, 'rb'))
        ds_ = NSL(dataset, scaling=scaling, encoding=encoding)
        for classifier in classifiers_:
            # Evaluate SVM only when min-max scaled (time constraint)
            if classifier.name == 'SVM' and scaling != 'Min-max':
                continue
            dump_file = '%s_%s_%s_%s_%s_%s.dmp' % (
                algo, features, dataset, encoding, scaling, classifier.name)
            results = []
            if isfile(join(CLFS_DIR, dump_file)):
                print('Classifier %s already exists, loading results from it'
                      % dump_file)
                results = pickle.load(open(join(CLFS_DIR, dump_file), 'rb'))
            else:
                print('Working on %s' % dump_file)
                ev = EvalClassifier(ds_, data, classifier, calc_prob=True)
                if ev.eval():
                    pickle.dump(ev.results,
                                open(join(CLFS_DIR, dump_file), 'wb'))
                    results = ev.results
                else:
                    print("Error evaluating %s" % dump_file)
                    continue
            # Create the report: one line per split of the model
            for i, res in enumerate(results):
                line = [
                    dataset, encoding, scaling, algo, features,
                    data[i]['k'],
                    ' - '.join(map(str, data[i]['SPLIT_SIZES'])),
                    classifier.name, classifier.info, 5,
                    '%.2f' % res[EV_TIME_TRN],
                    '%.2f' % res[EV_TIME_TST],
                    '%.2f' % res[EV_FSCORE],
                    '%.2f' % res[EV_PRE],
                    '%.2f' % res[EV_REC]
                ]
                line = np.append(line, res[EV_CM].flatten())
                if EV_AUC in res:
                    for lbl in NSL.standard_labels():
                        line = np.append(line, '%.2f' % res[EV_AUC][lbl])
                csvwriter.writerow(line)
    csv_file.close()
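# A minimal usage sketch (assumption: the clusters report at CLUSTERS_REPORT
# already exists and contains the 'Valid?' / 'Other DS valid?' columns
# checked above):
# eval_classifiers(classifiers=['DT', 'RF', 'NN'])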
def balance_score(data, r=range(5, 6)):
    # Cluster with k-means for every k in r and report the worst (largest)
    # ratio between the biggest and the smallest cluster.
    minmax_ratios = []
    for i in r:
        km = KMeans(random_state=0, n_clusters=i)
        result = km.fit_predict(data)
        _, counts = np.unique(result, return_counts=True)
        minmax_ratios.append(float(np.max(counts)) / np.min(counts))
    #score = np.average(minmax_ratios, weights=r)
    score = np.max(minmax_ratios)
    return score


nsl = NSL(ds.NSL_TRAIN20, ds.ENC_NUMERIC, ds.SCL_NONE)
best_known_features = [
    'num_access_files', 'num_compromised', 'rerror_rate', 'urgent',
    'dst_host_same_srv_rate', 'dst_host_srv_rerror_rate', 'srv_serror_rate',
    'is_host_login', 'wrong_fragment', 'serror_rate', 'num_shells',
    'num_outbound_cmds', 'is_guest_login', 'dst_host_rerror_rate',
    'dst_host_srv_serror_rate', 'hot', 'dst_host_srv_count', 'logged_in',
    'srv_rerror_rate', 'dst_host_srv_diff_host_rate', 'num_root',
    'dst_host_same_src_port_rate', 'root_shell', 'su_attempted',
    'dst_host_count', 'num_file_creations', 'count', 'land', 'same_srv_rate',
    'dst_host_diff_srv_rate', 'srv_diff_host_rate', 'diff_srv_rate',
    'num_failed_logins', 'dst_host_serror_rate'
]
i = 0
        # Solve Y
        V = 1 / (2 + mu) * (2 * X.T.dot(W) + 2 * np.ones((n, 1)) * b.T +
                            mu * Z - Lambda)
        ind = np.argmax(V, axis=1)
        Y = np.zeros((n, c))
        for i, i2 in enumerate(ind):
            Y[i][i2] = 1

        # Update Lambda and mu according to ALM
        Lambda = Lambda + mu * (Y - Z)
        mu = np.min([mu * rho, 100000])

        # Objective value
        val = E.T.dot(E).trace() + gamma * (W.T.dot(W).trace()) + \
            lam * Y.T.dot(np.ones((n, n))).dot(Y).trace()
        objs.append(val.tolist()[0][0])

    return Y


samples = 100
clusters = 5
x = NSL(ds.NSL_TRAIN20, encoding=ds.ENC_NUMERIC, scaling=ds.SCL_NONE)
x = x.ds.iloc[range(0, samples)]
# Center the data
x = x - x.mean()
# Random one-hot initialization of the cluster-assignment matrix
y = np.zeros((samples, clusters))
for i in xrange(samples):
    y[i][np.random.randint(clusters)] = 1
    #y[i][1] = 1
res = bcls_alm(x, y, 0.01, 0.3, 0.1)
print(res)
def cross_val(ds, clf, kfold, classes=NSL.standard_labels(), calc_prob=True):
    labels = NSL.get_labels(ds)
    # skf = StratifiedKFold(labels, n_splits=kfold)  # SciKit 0.17
    skf = StratifiedKFold(n_splits=kfold, random_state=0)
    trn_timer = 0
    tst_timer = 0
    global_true = pd.Series()
    global_pred = np.array([])
    global_prob = None

    # for train_index, test_index in skf:  # SciKit 0.17
    for train_index, test_index in skf.split(ds, labels):
        X_train, X_test = ds.iloc[train_index], ds.iloc[test_index]
        Y_train, Y_test = labels.iloc[train_index], labels.iloc[test_index]

        # local_clf = clone(clf)
        local_clf = clf.clone()
        start = time.clock()
        local_clf.fit(X_train, Y_train)
        trn_timer += time.clock() - start

        start = time.clock()
        y_pred = local_clf.predict(X_test)
        if calc_prob:
            y_prob = local_clf.predict_proba(X_test)
        tst_timer += time.clock() - start

        global_true = global_true.append(Y_test)
        global_pred = np.append(global_pred, y_pred)

        if calc_prob:
            # If the local classifier didn't learn all classes we need to
            # "pad" the probabilities matrix with zeros
            if len(local_clf.classes_) != len(classes):
                for i, cls in enumerate(classes):
                    if cls in local_clf.classes_:
                        column = y_prob[:, np.where(
                            local_clf.classes_ == cls)[0][0]]
                    else:
                        column = np.zeros((y_prob.shape[0], 1))
                    if i == 0:
                        tmp_prob = column
                    else:
                        tmp_prob = np.column_stack((tmp_prob, column))
                y_prob = tmp_prob
            if global_prob is None:
                global_prob = y_prob
            else:
                global_prob = np.row_stack((global_prob, y_prob))

    result = {
        EV_PRED: global_pred,
        EV_PROB: global_prob,
        EV_TRUE: global_true,
        EV_TIME_TRN: trn_timer,
        EV_TIME_TST: tst_timer
    }
    result.update(
        EvalClassifier.calc_scores(global_true, global_pred, global_prob,
                                   classes, calc_roc=False))
    return result
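# A minimal usage sketch (assumptions: `dt3` is one of this project's
# classifier wrappers exposing clone()/fit()/predict()/predict_proba(), and
# nsl.ds carries the label column expected by NSL.get_labels()):
# nsl = NSL(ds.NSL_TRAIN20, ds.ENC_NUMERIC, ds.SCL_MINMAX)
# res = cross_val(nsl.ds, dt3, kfold=5)
# print(res[EV_FSCORE], res[EV_TIME_TRN], res[EV_TIME_TST])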
import numpy as np
import dataset as ds
from dataset import NSL
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.utils.validation import check_array
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
from exlasso import _exlasso
from splitters import EXLasso

data = NSL(ds.NSL_TRAIN20, ds.ENC_NUMERIC, ds.SCL_MINMAX)
X = check_array(data.ds, order="C")

n_clusters = 2
ex = EXLasso(n_clusters, init='random', gamma=0.3)
#res = ex.fit_predict(X)
#print(np.unique(res, return_counts=True))

np.random.seed(0)
n_samples = 10000
varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=175,
                             n_features=3)
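# A sketch of clustering the synthetic blobs (assumption: EXLasso.fit_predict
# accepts a plain numpy array, as in the commented-out NSL run above):
# X_blobs, _ = varied
# blob_labels = EXLasso(3, init='random', gamma=0.3).fit_predict(
#     StandardScaler().fit_transform(X_blobs))
# print(np.unique(blob_labels, return_counts=True))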