def random_score(n_iter=20, frac=0.2, dataset=ds.NSL_TRAIN):
    # NOTE: `frac` is currently unused; sampling a fraction of the data on each
    # iteration may have been the original intent.
    data = NSL(dataset, ds.ENC_NUMERIC, ds.SCL_NONE).ds
    scores = np.zeros(data.shape[1], dtype=float)
    for _ in xrange(n_iter):
        scores += features_imbalance3(data)
    return scores / n_iter
def test(data=ds.NSL_TRAIN20, frac=0.1, n_clusters=9):
    nsl = NSL(data, ds.ENC_NUMERIC, ds.SCL_NONE)
    data = pd.DataFrame(StandardScaler().fit_transform(
        nsl.ds.sample(frac=frac)),
                        columns=nsl.ds.columns)
    scores = features_imbalance3(data)
    #scores = [4.9687348601488857e-06, 0.048005449773971885, 0.30090289606345372, 0.039999999997479371, 0.000583154682335534, 0.071427437376211239, 0.0006369753925856707, 0.0064933003025774748, 2.0059375750962523e-05, 0.12727121443046172, 0.0059171597629407305, 0.0071941874935393642, 0.11714027214813312, 0.0010052146319188752, 0.019230616569268669, 0.0019378614210404626, 0.0, 0.99999999993698407, 0.00084245998309771147, 9.3999531402734481e-08, 2.9174852834226813e-07, 4.1833931281974199e-07, 3.8195776838186883e-07, 1.1509224936799674e-06, 1.6633409340012933e-06, 1.2001268628255685e-07, 1.4342506054173522e-06, 2.8705243184823012e-06, 4.3492993562524208e-08, 6.8499758177045603e-08, 1.5217451831180736e-07, 9.5622234800440755e-07, 5.3457550733435897e-07, 4.43769710268333e-06, 2.7885047378256104e-07, 2.9052731267311483e-07, 6.6749552744942154e-07, 6.5967876419635182e-07]
    # scores2 = np.sqrt(np.array(scores).astype(float))
    for i, col in enumerate(nsl.ds.columns):
        if scores[i] is not None and not np.isnan(scores[i]) and scores[i] != 0:
            data[col] = data[col] / scores[i]
        else:
            # A zero or NaN imbalance score means the feature carries no usable
            # signal, so zero the column out instead of dividing by it.
            data[col] = 0
    km = KMeans(n_clusters=n_clusters, random_state=0)
    res = km.fit_predict(data)
    labels, counts = np.unique(res, return_counts=True)
    return counts
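
# Hedged usage sketch (not part of the original snippet): print the cluster
# sizes for a 10% sample of NSL_TRAIN20; a flat distribution means the
# score-weighted features produce balanced clusters.
print(test(data=ds.NSL_TRAIN20, frac=0.1, n_clusters=9))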
Example #3
def create_clustering_models():
    models = []
    nsl_features = [
        NSL.FEATURES_2SECS_HOST, NSL.FEATURES_2SECS_SERVICE,
        NSL.FEATURES_100CONNS_HOST, NSL.FEATURES_EXPERT, NSL.FEATURES_TCP,
        np.append(NSL.FEATURES_2SECS_HOST,
                  NSL.FEATURES_2SECS_SERVICE), NSL.FEATURES_2SECS, NSL.FEATURES
    ]
    nsl_descs = [
        '2 secs same dest host', '2 secs same service',
        '100 connections same host', 'expert features', 'single TCP features',
        'all 2 secs', 'all history based features', 'all features'
    ]
    for dataset in (ds.NSL_TRAIN20, ds.NSL_TEST):
        for encoding in (ds.ENC_NUMERIC, ds.ENC_HOT):
            for scaling in (ds.SCL_NONE, ds.SCL_MINMAX, ds.SCL_STD):
                nsl = NSL(dataset, encoding, scaling)
                models.append(CM(nsl).gen_model(MultiPart, seed=0))
                models.append(CM(nsl).gen_model(WKMeans, random_state=0))
                models.append(
                    CM(nsl).gen_model(EXLasso,
                                      random_state=0,
                                      gamma=0.1,
                                      tol=1e-2,
                                      verbose=True))
                models.append(
                    CM(nsl).gen_model(KMeansBal,
                                      random_state=0,
                                      clusters_factor=5))

        for encoding in (ds.ENC_NUMERIC, ds.ENC_HOT):
            for scaling in (ds.SCL_NONE, ds.SCL_MINMAX, ds.SCL_STD):
                nsl = NSL(dataset, encoding, scaling)

                # The following splitters are features agnostic. However, they
                # are added for every encoding and scaling since it matters for
                # the ML classifiers later on in the process
                models.append(CM(nsl).gen_model(RoundRobin))
                models.append(CM(nsl).gen_model(RandomRoundRobin))
                models.append(CM(nsl, min_k=1, max_k=1).gen_model(NoSplit))

                # Add all clustering models
                for f, d in zip(nsl_features, nsl_descs):
                    models.append(
                        CM(nsl, f, d).gen_model(KMeans, random_state=0))

    for model in models:
        file_name = "%s_%s_%s_%s_%s.dmp" % (
            model.algorithm,
            model.features_desc,
            model.dataset.ds_name,
            model.dataset.encoding,
            model.dataset.scaling,
        )
        if isfile(os.path.join(MODELS_DIR, file_name)):
            print("Skipping %s" % file_name)
            continue
        print("Running model %s %s %s %s %s" %
              (model.dataset.ds_name, model.dataset.encoding,
               model.dataset.scaling, model.algorithm, model.features_desc))
        model.run()
        model.save(os.path.join(MODELS_DIR, file_name))
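
# Hedged sketch (not in the original code): a saved dump can be re-loaded with
# pickle using the same "<algo>_<features>_<dataset>_<encoding>_<scaling>.dmp"
# naming scheme that eval_classifiers relies on further below, e.g.:
#     model_data = pickle.load(open(os.path.join(MODELS_DIR, file_name), 'rb'))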
Example #4
def eval_classifiers(report_file=CLASSIFIERS_REPORT,
                     classifiers=None,
                     clusters_file=CLUSTERS_REPORT):
    clfmap = {'NN': nn, 'DT': dt3, 'RF': rf3, 'MLP': mlp, 'SVM': svmlin}
    # Default to every known classifier when none are requested explicitly.
    if classifiers is None:
        classifiers = list(clfmap.keys())
    classifiers_ = [clfmap[clf] for clf in classifiers]

    clusters = pd.read_csv(clusters_file)
    csv_file = open(report_file, 'wb')
    csvwriter = csv.writer(csv_file)
    header = [
        'Dataset', 'Encoding', 'Scaling', 'Algo', 'Features', 'K',
        'Split Sizes', 'Classifier', 'Classifier Info', 'K-fold',
        'Training Time', 'Testing Time', 'F-Score', 'Precision', 'Recall'
    ]
    labels = NSL.standard_labels()
    for a in labels:
        for b in labels:
            header.append("True %s, Predicted %s" % (a, b))
    for a in labels:
        header.append("AUC %s" % a)
    csvwriter.writerow(header)

    for i, row in clusters.iterrows():
        # Use `!=` rather than `is not`: the CSV columns may hold numpy bools,
        # for which identity comparison against True always fails.
        if row['Valid?'] != True or row['Other DS valid?'] != True:
            continue
        algo = row['Algorithm']
        features = row['Features']
        dataset = row['Dataset']
        encoding = row['Encoding']
        scaling = row['Scaling']

        model_file = '%s_%s_%s_%s_%s.dmp' % (algo, features, dataset, encoding,
                                             scaling)
        model_file = join(MODELS_DIR, model_file)
        if not isfile(model_file):
            print('Model %s was not found!' % model_file)
            continue

        data = pickle.load(open(model_file, 'rb'))
        ds_ = NSL(dataset, scaling=scaling, encoding=encoding)
        for classifier in classifiers_:
            # Evaluate SVM only when min-max scaled (time constraint)
            if classifier.name == 'SVM' and scaling != 'Min-max':
                continue

            dump_file = '%s_%s_%s_%s_%s_%s.dmp' % (
                algo, features, dataset, encoding, scaling, classifier.name)

            results = []
            if isfile(join(CLFS_DIR, dump_file)):
                print('Classifier %s already exists, loading results from it' %
                      dump_file)
                results = pickle.load(open(join(CLFS_DIR, dump_file), 'rb'))
            else:
                print('Working on %s' % dump_file)
                ev = EvalClassifier(ds_, data, classifier, calc_prob=True)
                if ev.eval():
                    pickle.dump(ev.results,
                                open(join(CLFS_DIR, dump_file), 'wb'))
                    results = ev.results
                else:
                    print("Error evaluating %s" % dump_file)
                    continue

            # Create report
            for j, res in enumerate(results):
                line = [
                    dataset, encoding, scaling, algo, features, data[j]['k'],
                    ' - '.join(str(x) for x in data[j]['SPLIT_SIZES']),
                    classifier.name, classifier.info, 5,
                    '%.2f' % res[EV_TIME_TRN],
                    '%.2f' % res[EV_TIME_TST],
                    '%.2f' % res[EV_FSCORE],
                    '%.2f' % res[EV_PRE],
                    '%.2f' % res[EV_REC]
                ]

                line = np.append(line, res[EV_CM].flatten())
                if EV_AUC in res:
                    for lbl in NSL.standard_labels():
                        line = np.append(line, '%.2f' % res[EV_AUC][lbl])

                csvwriter.writerow(line)
                # Flush after every row so partial results survive a crash mid-run.
                csv_file.flush()
    csv_file.close()
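
# Hedged usage sketch (an assumed invocation, not in the original module):
# evaluate only the tree-based classifiers against every valid clustering row.
# eval_classifiers(classifiers=['DT', 'RF'])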

def balance_score(ds, r=range(5, 6)):
    minmax_ratios = []
    for i in r:
        km = KMeans(random_state=0, n_clusters=i)
        result = km.fit_predict(ds)
        labels, counts = np.unique(result, return_counts=True)
        minmax_ratios.append(float(np.max(counts)) / np.min(counts))

    #score = np.average(minmax_ratios, weights=r)
    score = np.max(minmax_ratios)
    return score


nsl = NSL(ds.NSL_TRAIN20, ds.ENC_NUMERIC, ds.SCL_NONE)

best_known_features = [
    'num_access_files', 'num_compromised', 'rerror_rate', 'urgent',
    'dst_host_same_srv_rate', 'dst_host_srv_rerror_rate', 'srv_serror_rate',
    'is_host_login', 'wrong_fragment', 'serror_rate', 'num_shells',
    'num_outbound_cmds', 'is_guest_login', 'dst_host_rerror_rate',
    'dst_host_srv_serror_rate', 'hot', 'dst_host_srv_count', 'logged_in',
    'srv_rerror_rate', 'dst_host_srv_diff_host_rate', 'num_root',
    'dst_host_same_src_port_rate', 'root_shell', 'su_attempted',
    'dst_host_count', 'num_file_creations', 'count', 'land', 'same_srv_rate',
    'dst_host_diff_srv_rate', 'srv_diff_host_rate', 'diff_srv_rate',
    'num_failed_logins', 'dst_host_serror_rate'
]

i = 0
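
# Hedged sketch (an assumption, not part of the original script): measure how
# balanced a 5-cluster k-means split of the best known features is.
print(balance_score(nsl.ds[best_known_features], r=range(5, 6)))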
Example #6
        # Solve Y
        V = 1 / (2 + mu) * (2 * X.T.dot(W) + 2 * np.ones(
            (n, 1)) * b.T + mu * Z - Lambda)
        ind = np.argmax(V, axis=1)
        Y = np.zeros((n, c))
        for i, i2 in enumerate(ind):
            Y[i][i2] = 1

        # Update Lambda and mu according to ALM
        Lambda = Lambda + mu * (Y - Z)
        mu = np.min([mu * rho, 100000])

        # Objective value
        val = E.T.dot(E).trace() + gamma*(W.T.dot(W).trace()) + \
            lam*Y.T.dot(np.ones((n, n))).dot(Y).trace()
        objs.append(val.tolist()[0][0])
    return Y
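
# For reference (a hedged reading of the fragment above): the value appended to
# `objs` is  tr(E'E) + gamma * tr(W'W) + lam * tr(Y' * 1_{n x n} * Y).  With Y a
# 0/1 cluster-indicator matrix, the last trace equals the sum of squared cluster
# sizes, which is the term that penalizes unbalanced assignments:
#     np.trace(Y.T.dot(np.ones((n, n))).dot(Y)) == np.sum(Y.sum(axis=0) ** 2)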


samples = 100
clusters = 5
x = NSL(ds.NSL_TRAIN20, encoding=ds.ENC_NUMERIC, scaling=ds.SCL_NONE)
x = x.ds.iloc[range(0, samples)]
x = x - x.mean()
y = np.zeros((samples, clusters))
for i in xrange(samples):
    y[i][np.random.randint(clusters)] = 1
    #y[i][1] = 1
res = bcls_alm(x, y, 0.01, 0.3, 0.1)
print(res)
Example #7
    def cross_val(ds,
                  clf,
                  kfold,
                  classes=NSL.standard_labels(),
                  calc_prob=True):
        labels = NSL.get_labels(ds)

        # skf = StratifiedKFold(labels, n_splits=kfold) # SciKit 0.17
        # shuffle=True so that random_state actually affects the fold assignment
        skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=0)
        trn_timer = 0
        tst_timer = 0

        global_true = pd.Series()
        global_pred = np.array([])
        global_prob = None

        # for train_index, test_index in skf: # SciKit 0.17
        for train_index, test_index in skf.split(ds, labels):
            X_train, X_test = ds.iloc[train_index], ds.iloc[test_index]
            Y_train, Y_test = labels.iloc[train_index], labels.iloc[test_index]
            # local_clf = clone(clf)
            local_clf = clf.clone()

            start = time.clock()
            local_clf.fit(X_train, Y_train)
            trn_timer += time.clock() - start

            start = time.clock()
            y_pred = local_clf.predict(X_test)
            if (calc_prob):
                y_prob = local_clf.predict_proba(X_test)
            tst_timer += time.clock() - start

            global_true = global_true.append(Y_test)
            global_pred = np.append(global_pred, y_pred)

            if (calc_prob):
                # If local classifier didn't learn all classes we need to "pad"
                # probabilities matrix with zeros
                if (len(local_clf.classes_) != len(classes)):
                    for i, cls in enumerate(classes):
                        if cls in local_clf.classes_:
                            column = y_prob[:,
                                            np.where(local_clf.classes_ ==
                                                     cls)[0][0]]
                        else:
                            column = np.zeros((y_prob.shape[0], 1))

                        if i == 0:
                            tmp_prob = column
                        else:
                            tmp_prob = np.column_stack((tmp_prob, column))
                    y_prob = tmp_prob
                if global_prob is None:
                    global_prob = y_prob
                else:
                    global_prob = np.row_stack((global_prob, y_prob))

        result = {
            EV_PRED: global_pred,
            EV_PROB: global_prob,
            EV_TRUE: global_true,
            EV_TIME_TRN: trn_timer,
            EV_TIME_TST: tst_timer
        }

        result.update(
            EvalClassifier.calc_scores(global_true,
                                       global_pred,
                                       global_prob,
                                       classes,
                                       calc_roc=False))
        return result
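
# Hedged usage sketch (names assumed from the surrounding module; cross_val
# appears to live inside EvalClassifier): 5-fold CV of the dt3 wrapper on the
# numerically encoded, standardized training subset.
#     nsl = NSL(ds.NSL_TRAIN20, ds.ENC_NUMERIC, ds.SCL_STD)
#     res = EvalClassifier.cross_val(nsl.ds, dt3, kfold=5)
#     print('train %.2fs / test %.2fs' % (res[EV_TIME_TRN], res[EV_TIME_TST]))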
Example #8
import numpy as np
import dataset as ds
from dataset import NSL
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.utils.validation import check_array
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

from exlasso import _exlasso
from splitters import EXLasso

data = NSL(ds.NSL_TRAIN20, ds.ENC_NUMERIC, ds.SCL_MINMAX)

X = check_array(data.ds, order="C")

n_clusters = 2


ex = EXLasso(n_clusters, init='random', gamma=0.3)
#res = ex.fit_predict(X)
#print(np.unique(res, return_counts=True))

np.random.seed(0)
n_samples = 10000

varied = datasets.make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=175, n_features=3)
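
# Hedged continuation sketch (the original snippet is truncated here):
# make_blobs returns a (points, labels) tuple, so the synthetic data could be
# clustered with the EXLasso splitter configured above, e.g.:
#     X_blobs, _ = varied
#     print(np.unique(ex.fit_predict(StandardScaler().fit_transform(X_blobs)),
#                     return_counts=True))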