def get_1dlbp_features(neighborhood):
    tf = TrainFiles(inp_path, floor = neighborhood * 2 + 1)
    inputs = tf.get_training_inputs()

    start = timer()
    hist = np.array([])
    outs = np.array([])
    i = 0
    writeBatch = 2000

    prep_out_path(out_path)

    # powers of two for packing the neighborhood comparisons into a code
    p = 1 << np.array(range(0, 2 * neighborhood), dtype='int32')
    d_powers = cuda.to_device(p)

    for inp in inputs:
        data_file = path.join(inp_path, inp)
        out_file = path.join(out_path, path.splitext(inp)[0] + ext)

        arr = np.fromfile(data_file, dtype = 'uint8')

        ##GPU##
        file_hist = extract_1dlbp_gpu(arr, neighborhood, d_powers)

        ##CPU##
        #file_hist = extract_1dlbp_cpu(arr, neighborhood, p)
        #file_hist = file_histogram(file_hist, neighborhood)

        i += 1
        hist = append_to_arr(hist, file_hist)
        outs = append_to_arr(outs, out_file)

        # flush accumulated histograms to disk in batches
        if i == writeBatch:
            i = 0
            print "Writing....."
            for j in range(0, outs.shape[0]):
                hist[j].tofile(outs[j])
            hist = np.array([])
            outs = np.array([])

    print "==============Done==================="
    print "Elapsed: ", timer() - start

    # write out whatever is left over after the last full batch
    print "Writing......."
    for i in range(0, outs.shape[0]):
        hist[i].tofile(outs[i])
    print "==============Done==================="
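# For context: extract_1dlbp_gpu / extract_1dlbp_cpu are defined elsewhere.
# Below is a minimal CPU sketch of the 1D local binary pattern histogram
# they are assumed to compute, based on the powers-of-two array `p` above:
# each byte is compared to its `neighborhood` left and right neighbors,
# the comparison bits are packed into a code, and the codes are
# histogrammed. This is an illustration, not the module's implementation.
def extract_1dlbp_cpu_sketch(arr, neighborhood, p):
    hist = np.zeros(1 << (2 * neighborhood), dtype='int32')
    for i in range(neighborhood, arr.shape[0] - neighborhood):
        left = arr[i - neighborhood : i]
        right = arr[i + 1 : i + neighborhood + 1]
        # neighbors >= center contribute their power of two to the code
        code = ((np.r_[left, right] >= arr[i]) * p).sum()
        hist[code] += 1
    return hist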
def do_train():
    X, Y, Xt, Yt = TrainFiles.from_csv(csv_file)

    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    #pca = PCA(250)
    #pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    #X_pca = pca.transform(sl.X_train_scaled)
    #X_pca_test = pca.transform(sl.X_test_scaled)

    ##construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]
    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)

    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost

    trndata, tstdata = createDataSets(X, Y, Xt, Yt)
    fnn = train(trndata, tstdata, epochs=1000, test_error=0.025, momentum=0.15, weight_decay=0.0001)
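# train and createDataSets come from train_nn (not shown here). A hedged
# sketch of what train is assumed to do, given its signature: build a
# PyBrain feed-forward classifier and run backprop until the test-error
# target or the epoch budget is hit. The hidden-layer size (100) is a
# made-up example, not the project's actual topology.
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure.modules import SoftmaxLayer
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.utilities import percentError

def train_sketch(trndata, tstdata, epochs, test_error, momentum, weight_decay):
    fnn = buildNetwork(trndata.indim, 100, trndata.outdim, outclass=SoftmaxLayer)
    trainer = BackpropTrainer(fnn, dataset=trndata, momentum=momentum,
                              weightdecay=weight_decay)
    for epoch in xrange(epochs):
        trainer.trainEpochs(1)
        err = percentError(trainer.testOnClassData(dataset=tstdata),
                           tstdata['class']) / 100.
        if err <= test_error:
            break
    return fnn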
def do_train_with_freq():
    tf_mix = TrainFiles(train_path = train_path_mix, labels_file = labels_file, test_size = 0.)
    tf_freq = TrainFiles(train_path = train_path_freq, labels_file = labels_file, test_size = 0.)

    X_m, Y_m, _, _ = tf_mix.prepare_inputs()
    X_f, Y_f, _, _ = tf_freq.prepare_inputs()

    # fuse the two feature sets column-wise; labels are the same for both
    X = np.c_[X_m, X_f]
    Y = Y_f

    X, Xt, Y, Yt = train_test_split(X, Y, test_size = 0.1)

    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    pca = PCA(250)
    pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    X_pca = pca.transform(sl.X_train_scaled)
    X_pca_test = pca.transform(sl.X_test_scaled)

    #sl.train_params = {'C': 100, 'gamma': 0.0001, 'probability' : True}

    #print "Start SVM: ", time_now_str()
    #sl_ll_trn, sl_ll_tst = sl.fit_and_validate()
    #print "Finish SVM: ", time_now_str()

    ##construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]
    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)

    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost

    trndata, tstdata = createDataSets(X_pca, Y, X_pca_test, Yt)
    fnn = train(trndata, tstdata, epochs=1000, test_error=0.025, momentum=0.2, weight_decay=0.0001)
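# Quick illustration of the numpy shorthands used above: np.c_ stacks
# column-wise (feature fusion across the same samples), while np.r_
# stacks row-wise (e.g. train on top of test before fitting PCA).
# Toy shapes only; not part of the pipeline.
import numpy as np
a = np.arange(6).reshape(3, 2)   # 3 samples, 2 features
b = np.arange(3).reshape(3, 1)   # 3 samples, 1 feature
print np.c_[a, b].shape          # (3, 3): same samples, fused features
print np.r_[a, a].shape          # (6, 2): stacked samples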
from SupervisedLearning import SKSupervisedLearning
from train_files import TrainFiles
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from tr_utils import vote
import numpy as np
import matplotlib.pylab as plt

from train_nn import createDataSets, train

train_path_mix = "/kaggle/malware/mix_lbp.csv"
labels_file = "/kaggle/malware/trainLabels.csv"

X, Y_train, Xt, Y_test = TrainFiles.from_csv(train_path_mix)

def plot_confusion(sl):
    conf_mat = confusion_matrix(sl.Y_test, sl.clf.predict(sl.X_test_scaled)).astype(dtype='float')
    # normalize each row so every true class sums to 1
    norm_conf_mat = conf_mat / conf_mat.sum(axis=1)[:, None]

    fig = plt.figure()
    plt.clf()
    ax = fig.add_subplot(111)
    ax.set_aspect(1)
    res = ax.imshow(norm_conf_mat, cmap=plt.cm.jet, interpolation='nearest')
    cb = fig.colorbar(res)
    labs = np.unique(Y_test)
    x = labs - 1
    plt.xticks(x, labs)
    plt.yticks(x, labs)
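# The [:, None] above reshapes the row sums into a column vector so that
# broadcasting divides each row of the confusion matrix by its own sum,
# turning each true-class row into a distribution. Toy example:
cm = np.array([[8., 2.], [1., 9.]])
print cm / cm.sum(axis=1)[:, None]   # [[0.8, 0.2], [0.1, 0.9]]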
from tr_utils import time_now_str
import sklearn.svm as svm
import numpy as np
from train_files import TrainFiles
from SupervisedLearning import SKSupervisedLearning
from sklearn.decomposition import PCA

out_labels = "/kaggle/malware/scratchpad/submission_fake.csv"

# instantiate file system interface
tf = TrainFiles('/kaggle/malware/scratchpad/train/1dlbp', '/kaggle/malware/scratchpad/test/1dlbp', "/kaggle/malware/trainLabels.csv")

# read in our data
X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

sl = SKSupervisedLearning(svm.SVC, X_train, Y_train, X_test, Y_test)
sl.fit_standard_scaler()

def test_fit_svm():
    # start fitting
    sl.train_params = {'probability': True, 'C': 100, 'gamma': 0.1}
    print "Starting: ", time_now_str()

    # log loss is the score
    tscore, valscore = sl.fit_and_validate()
    print "Finished: ", time_now_str()
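# Hedged sketch of what SKSupervisedLearning.fit_and_validate is assumed
# to do (the class lives in SupervisedLearning.py, which is not shown):
# instantiate the wrapped classifier with train_params, fit it on the
# scaled training split, and report multi-class log loss on both splits.
# Attribute names below follow usage elsewhere in this project but are
# still guesses about the class internals.
from sklearn.metrics import log_loss

def fit_and_validate_sketch(sl):
    clf = svm.SVC(**sl.train_params)   # e.g. SVC(C=100, gamma=0.1, probability=True)
    clf.fit(sl.X_train_scaled, sl.Y_train)
    ll_trn = log_loss(sl.Y_train, clf.predict_proba(sl.X_train_scaled))
    ll_tst = log_loss(sl.Y_test, clf.predict_proba(sl.X_test_scaled))
    return ll_trn, ll_tst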
from tr_utils import time_now_str
import sklearn.svm as svm
import numpy as np
from train_files import TrainFiles
from SupervisedLearning import SKSupervisedLearning
from sklearn.decomposition import SparsePCA, PCA
from train_nn import train, createDataSets
from sklearn.naive_bayes import MultinomialNB
from sklearn.lda import LDA
from sklearn.preprocessing import normalize

tf = TrainFiles('/kaggle/malware/scratchpad/text/train/instr_freq', '/kaggle/malware/scratchpad/text/test/instr_freq', "/kaggle/malware/trainLabels.csv")
tf1 = TrainFiles('/kaggle/malware/scratchpad/train/1dlbp', '/kaggle/malware/scratchpad/test/1dlbp', "/kaggle/malware/trainLabels.csv")

X_train, Y_train, X_test, Y_test = tf1.prepare_inputs()

n_components = 300
pca = PCA(n_components = n_components)
pca.fit(np.r_[X_train, X_test])

#n_components = np.where(np.cumsum(pca.explained_variance_ratio_) >= 0.99)[0][0]
#print n_components

#pca = PCA(n_components = n_components)
#pca.fit(np.r_[X_train, X_test])

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
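# The commented-out block above picks n_components as the first index at
# which the cumulative explained variance reaches 99%. A self-contained
# illustration on random data (made up for the example). Note the index
# is arguably off by one: index i means i + 1 components are needed.
rnd = np.random.RandomState(0)
Xr = rnd.randn(200, 50)
full = PCA().fit(Xr)   # keep all components to inspect the spectrum
n99 = np.where(np.cumsum(full.explained_variance_ratio_) >= 0.99)[0][0] + 1
print n99   # smallest number of components explaining 99% of the variance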
def predict():
    tf = TrainFiles('/kaggle/malware/train/mix_lbp', val_path = '/kaggle/malware/test/mix_lbp', labels_file = "/kaggle/malware/trainLabels.csv")
    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}

    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()
    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()
    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based = True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)

    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)
    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest wrapped in cross-validated probability calibration
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train, Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()

        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()
        print "RF score: {0:.4f}".format(ll_ccrf_tst if not prediction else ll_ccrf_trn)

        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")
        print "Finished training RF: ", time_now_str()

    if prediction:
        # weighted vote over the per-classifier probability matrices
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn], [2./3., 1./6., 1./3.])

        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)
    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)
        X = pca.transform(sl_svm.X_train_scaled)

        x = np.arange(X[:, 0].min() - 1, X[:, 0].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)
        xx, yy = np.meshgrid(x, y)

        # titles for the plots
        titles = ['SVC with rbf kernel',
                  'Random Forest\nn_estimators=7500',
                  'Decision Trees\nn_estimators=7500']

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot; sl_rfc and sl_trees are module-level
        # classifiers defined elsewhere
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)

            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)
            plt.title(titles[i])

        plt.tight_layout()
        plt.show()
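# tr_utils.vote is not shown in this file. A hedged sketch of what a
# weighted probability vote like the one above typically does: weight
# each classifier's probability matrix, sum, and renormalize rows to 1
# (the weights above sum to 7/6, so renormalization would matter).
# Whether the real vote renormalizes is an assumption.
def vote_sketch(probas, weights):
    proba = sum(w * p for w, p in zip(weights, probas))
    return proba / proba.sum(axis=1)[:, None]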
# driver: extract the 1D LBP features for the retina dataset, then gather
# the resulting feature matrix into a single CSV
neighborhood = 4
get_1dlbp_features(neighborhood)

tf = TrainFiles(out_path, labels_file='/kaggle/retina/trainLabels.csv', test_size = 0.0)
X, Y, _, _ = tf.prepare_inputs()
tf.dump_to_csv(path.join(root_path, '1dlbp.csv'), X, Y)