def get_1dlbp_features(neighborhood):
    tf = TrainFiles(inp_path, floor = neighborhood * 2 + 1)
    inputs = tf.get_training_inputs()

    start = timer()
    hist = np.array([])
    outs = np.array([])

    i = 0
    writeBatch = 2000
    prep_out_path(out_path)

    p = 1 << np.array(range(0, 2 * neighborhood), dtype='int32')
    d_powers = cuda.to_device(p)

    for inp in inputs:

        data_file = path.join(inp_path, inp)

        out_file = path.join(out_path, path.splitext(inp)[0] + ext)
        arr = np.fromfile(data_file, dtype = 'uint8')

        ##GPU##
        file_hist = extract_1dlbp_gpu(arr, neighborhood, d_powers)

        ##CPU##
        #file_hist = extract_1dlbp_cpu(arr, neighborhood, p)
        #file_hist = file_histogram(file_hist, neighborhood)

        i += 1
        hist = append_to_arr(hist, file_hist)
        outs = append_to_arr(outs, out_file)

        if i == writeBatch:
            i = 0
            first = True
            print "Writing....."
            for j in range(0, outs.shape[0]):
                hist[j].tofile(outs[j])
            hist = np.array([])
            outs = np.array([])



    print "==============Done==================="
    print "Elapsed: ", timer() - start

    print "Writing......."

    for i in range(0, outs.shape[0]):
        hist[i].tofile(outs[i])

    print "==============Done==================="
예제 #2
0
def do_train():
    X, Y, Xt, Yt = TrainFiles.from_csv(csv_file)
    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    #pca = PCA(250)
    #pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    #X_pca = pca.transform(sl.X_train_scaled)
    #X_pca_test = pca.transform(sl.X_test_scaled)

    ##construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]

    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)

    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost

    trndata, tstdata = createDataSets(X, Y, X_test, Yt)
    fnn = train(trndata,
                tstdata,
                epochs=1000,
                test_error=0.025,
                momentum=0.15,
                weight_decay=0.0001)
예제 #3
0
def do_train():
    X, Y, Xt, Yt = TrainFiles.from_csv(csv_file)
    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    #pca = PCA(250)
    #pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    #X_pca = pca.transform(sl.X_train_scaled)
    #X_pca_test = pca.transform(sl.X_test_scaled)
    
    ##construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]

    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)

    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost


    trndata, tstdata = createDataSets(X, Y, X_test, Yt)
    fnn = train(trndata, tstdata, epochs = 1000, test_error = 0.025, momentum = 0.15, weight_decay = 0.0001)
예제 #4
0
def do_train_with_freq():
    tf_mix = TrainFiles(train_path = train_path_mix, labels_file = labels_file, test_size = 0.)
    tf_freq = TrainFiles(train_path = train_path_freq, labels_file = labels_file, test_size = 0.)

    X_m, Y_m, _, _ = tf_mix.prepare_inputs()
    X_f, Y_f, _, _ = tf_freq.prepare_inputs()

    X = np.c_[X_m, X_f]
    Y = Y_f

    X, Xt, Y, Yt = train_test_split(X, Y, test_size = 0.1)
    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    pca = PCA(250)
    pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    X_pca = pca.transform(sl.X_train_scaled)
    X_pca_test = pca.transform(sl.X_test_scaled)

    #sl.train_params = {'C': 100, 'gamma': 0.0001, 'probability' : True}
    #print "Start SVM: ", time_now_str()
    #sl_ll_trn, sl_ll_tst = sl.fit_and_validate()
    #print "Finish Svm: ", time_now_str()

    ##construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]

    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)

    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost


    trndata, tstdata = createDataSets(X_pca, Y, X_pca_test, Yt)
    fnn = train(trndata, tstdata, epochs = 1000, test_error = 0.025, momentum = 0.2, weight_decay = 0.0001)
예제 #5
0
def do_train_with_freq():
    tf_mix = TrainFiles(train_path=train_path_mix,
                        labels_file=labels_file,
                        test_size=0.)
    tf_freq = TrainFiles(train_path=train_path_freq,
                         labels_file=labels_file,
                         test_size=0.)

    X_m, Y_m, _, _ = tf_mix.prepare_inputs()
    X_f, Y_f, _, _ = tf_freq.prepare_inputs()

    X = np.c_[X_m, X_f]
    Y = Y_f

    X, Xt, Y, Yt = train_test_split(X, Y, test_size=0.1)
    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    pca = PCA(250)
    pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    X_pca = pca.transform(sl.X_train_scaled)
    X_pca_test = pca.transform(sl.X_test_scaled)

    #sl.train_params = {'C': 100, 'gamma': 0.0001, 'probability' : True}
    #print "Start SVM: ", time_now_str()
    #sl_ll_trn, sl_ll_tst = sl.fit_and_validate()
    #print "Finish Svm: ", time_now_str()

    ##construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]

    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)

    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost

    trndata, tstdata = createDataSets(X_pca, Y, X_pca_test, Yt)
    fnn = train(trndata,
                tstdata,
                epochs=1000,
                test_error=0.025,
                momentum=0.2,
                weight_decay=0.0001)
예제 #6
0
from SupervisedLearning import SKSupervisedLearning
from train_files import TrainFiles
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from tr_utils import vote
import matplotlib.pylab as plt
from train_nn import createDataSets, train

train_path_mix = "/kaggle/malware/mix_lbp.csv"
labels_file = "/kaggle/malware/trainLabels.csv"

X, Y_train, Xt, Y_test = TrainFiles.from_csv(train_path_mix)


def plot_confusion(sl):
    conf_mat = confusion_matrix(sl.Y_test, sl.clf.predict(
        sl.X_test_scaled)).astype(dtype='float')
    norm_conf_mat = conf_mat / conf_mat.sum(axis=1)[:, None]

    fig = plt.figure()
    plt.clf()
    ax = fig.add_subplot(111)
    ax.set_aspect(1)
    res = ax.imshow(norm_conf_mat, cmap=plt.cm.jet, interpolation='nearest')
    cb = fig.colorbar(res)
    labs = np.unique(Y_test)
    x = labs - 1

    plt.xticks(x, labs)
예제 #7
0
from tr_utils import time_now_str

import sklearn.svm as svm
import numpy as np

from train_files import TrainFiles
from SupervisedLearning import SKSupervisedLearning
from sklearn.decomposition import PCA

out_labels = "/kaggle/malware/scratchpad/submission_fake.csv"
# instantiate file system interface
tf = TrainFiles('/kaggle/malware/scratchpad/train/1dlbp',
                '/kaggle/malware/scratchpad/test/1dlbp',
                "/kaggle/malware/trainLabels.csv")

# read in our data
X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

sl = SKSupervisedLearning(svm.SVC, X_train, Y_train, X_test, Y_test)
sl.fit_standard_scaler()


def test_fit_svm():

    # start fitting
    sl.train_params = {'probability': True, 'C': 100, 'gamma': 0.1}

    print "Starting: ", time_now_str()
    # logloss is the score
    tscore, valscore = sl.fit_and_validate()
    print "Finished: ", time_now_str()
예제 #8
0
from tr_utils import time_now_str

import sklearn.svm as svm
import numpy as np

from train_files import TrainFiles
from SupervisedLearning import SKSupervisedLearning
from sklearn.decomposition import SparsePCA, PCA
from train_nn import train, createDataSets
from sklearn.naive_bayes import MultinomialNB
from sklearn.lda import LDA
from sklearn.preprocessing import normalize

tf = TrainFiles('/kaggle/malware/scratchpad/text/train/instr_freq', '/kaggle/malware/scratchpad/text/test/instr_freq', "/kaggle/malware/trainLabels.csv")
tf1 = TrainFiles('/kaggle/malware/scratchpad/train/1dlbp', '/kaggle/malware/scratchpad/test/1dlbp', "/kaggle/malware/trainLabels.csv")

X_train, Y_train, X_test, Y_test = tf1.prepare_inputs()

n_components = 300
pca = PCA(n_components = n_components)
pca.fit(np.r_[X_train, X_test])

#n_components = np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.99)[0][0]
#print n_components

#pca = PCA(n_components = n_components)
#pca.fit(np.r_[X_train, X_test])
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

예제 #9
0
def predict():
    tf = TrainFiles('/kaggle/malware/train/mix_lbp', val_path = '/kaggle/malware/test/mix_lbp', labels_file = "/kaggle/malware/trainLabels.csv")

    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}

    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()

    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()

    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based = True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)

    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train, Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()

        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()

        print "RF score: {0:.4f}".format(ll_ccrf_tst if not prediction else ll_ccrf_trn)
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")

        print "Finished training RF: ", time_now_str()

    if prediction:
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn], [2./3., 1./6., 1./3.])

        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)

    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)

        X = pca.transform(sl_svm.X_train_scaled)

        x = np.arange(X[:, 0].min() - 1, X[:, 1].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)

        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = ['SVC with rbf kernel',
                  'Random Forest \n'
                  'n_components=7500',
                  'Decision Trees \n'
                  'n_components=7500']

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)

            plt.title(titles[i])
        plt.tight_layout()
        plt.show()
        i += 1
        hist = append_to_arr(hist, file_hist)
        outs = append_to_arr(outs, out_file)

        if i == writeBatch:
            i = 0
            first = True
            for j in range(0, outs.shape[0]):
                hist[j].tofile(outs[j])
            hist = np.array([])
            outs = np.array([])



    print "==============Done==================="
    print "Elapsed: ", timer() - start

    print "Writing......."

    for i in range(0, outs.shape[0]):
        hist[i].tofile(outs[i])

    print "==============Done==================="

neighborhood = 4
get_1dlbp_features(neighborhood)
tf = TrainFiles(out_path, labels_file='/kaggle/retina/trainLabels.csv', test_size = 0.0)
X, Y, _, _ = tf.prepare_inputs()
tf.dump_to_csv(path.join(root_path, '1dlbp.csv'), X, Y)
        i += 1
        hist = append_to_arr(hist, file_hist)
        outs = append_to_arr(outs, out_file)

        if i == writeBatch:
            i = 0
            first = True
            for j in range(0, outs.shape[0]):
                hist[j].tofile(outs[j])
            hist = np.array([])
            outs = np.array([])

    print "==============Done==================="
    print "Elapsed: ", timer() - start

    print "Writing......."

    for i in range(0, outs.shape[0]):
        hist[i].tofile(outs[i])

    print "==============Done==================="


neighborhood = 4
get_1dlbp_features(neighborhood)
tf = TrainFiles(out_path,
                labels_file='/kaggle/retina/trainLabels.csv',
                test_size=0.0)
X, Y, _, _ = tf.prepare_inputs()
tf.dump_to_csv(path.join(root_path, '1dlbp.csv'), X, Y)
from SupervisedLearning import SKSupervisedLearning
from train_files import TrainFiles
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from tr_utils import vote
import matplotlib.pylab as plt
from train_nn import createDataSets, train

train_path_mix = "/kaggle/malware/mix_lbp.csv"
labels_file = "/kaggle/malware/trainLabels.csv"

X, Y_train, Xt, Y_test = TrainFiles.from_csv(train_path_mix)

def plot_confusion(sl):
    conf_mat = confusion_matrix(sl.Y_test, sl.clf.predict(sl.X_test_scaled)).astype(dtype='float')
    norm_conf_mat = conf_mat / conf_mat.sum(axis = 1)[:, None]

    fig = plt.figure()
    plt.clf()
    ax = fig.add_subplot(111)
    ax.set_aspect(1)
    res = ax.imshow(norm_conf_mat, cmap=plt.cm.jet, 
                    interpolation='nearest')
    cb = fig.colorbar(res)
    labs = np.unique(Y_test)
    x = labs - 1

    plt.xticks(x, labs)
    plt.yticks(x, labs)
예제 #13
0
def predict():
    tf = TrainFiles('/kaggle/malware/train/mix_lbp',
                    val_path='/kaggle/malware/test/mix_lbp',
                    labels_file="/kaggle/malware/trainLabels.csv")

    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}

    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()

    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()

    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based=True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)

    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train,
                                       Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()

        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()

        print "RF score: {0:.4f}".format(
            ll_ccrf_tst if not prediction else ll_ccrf_trn)
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")

        print "Finished training RF: ", time_now_str()

    if prediction:
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn],
                     [2. / 3., 1. / 6., 1. / 3.])

        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)

    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)

        X = pca.transform(sl_svm.X_train_scaled)

        x = np.arange(X[:, 0].min() - 1, X[:, 1].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)

        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = [
            'SVC with rbf kernel', 'Random Forest \n'
            'n_components=7500', 'Decision Trees \n'
            'n_components=7500'
        ]

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)

            plt.title(titles[i])
        plt.tight_layout()
        plt.show()