Example #1
def load_ember_dataset():
    """ Return train and test data from EMBER.

    :return: (array, array, array, array)
    """

    # Perform feature vectorization only if necessary.
    try:
        x_train, y_train, x_test, y_test = ember.read_vectorized_features(
            constants.EMBER_DATA_DIR, feature_version=1)

    except OSError:
        # Assume the failure means the vectorized .dat files do not
        # exist yet, and create them first.
        ember.create_vectorized_features(constants.EMBER_DATA_DIR,
                                         feature_version=1)
        x_train, y_train, x_test, y_test = ember.read_vectorized_features(
            constants.EMBER_DATA_DIR, feature_version=1)

    x_train = x_train.astype(dtype='float64')
    x_test = x_test.astype(dtype='float64')

    # Get rid of unknown labels
    x_train = x_train[y_train != -1]
    y_train = y_train[y_train != -1]
    x_test = x_test[y_test != -1]
    y_test = y_test[y_test != -1]

    return x_train, y_train, x_test, y_test
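
A minimal usage sketch (assuming constants.EMBER_DATA_DIR points at a valid EMBER dataset); the loader returns float64 NumPy arrays with the unlabeled rows already dropped:

# Hypothetical caller of load_ember_dataset(); the shapes shown assume
# EMBER feature version 1 (2,351 features per sample).
x_train, y_train, x_test, y_test = load_ember_dataset()
print(x_train.shape, x_test.shape)  # e.g. (600000, 2351) (200000, 2351)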
Example #2
def main():
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("datadir", metavar="DATADIR", type=str, help="Directory with raw features")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(args.datadir))

    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir)

    X_train_path = os.path.join(args.datadir, "X_train_vboat.dat")
    y_train_path = os.path.join(args.datadir, "y_train_vboat.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features for vboat")

        # Debugging aid: show which ember module is actually loaded.
        print(ember.__file__)
        ember.create_vectorized_features_vboat(args.datadir)
    print("Training LightGBM model")

    lgbm_model = ember.train_model_vboat(args.datadir, 50)
    lgbm_model.save_model(os.path.join(args.datadir, "model_vboat.txt"))
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--datadir", help="Features Directory", type=str)
    parser.add_argument("-o", "--output", help="output Directory", type=str)
    args = parser.parse_args()

    if not os.path.exists(args.datadir):
        parser.error("{} is not a directory".format(args.datadir))
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    # Count the rows in features.jsonl
    rows = 0
    with jsonlines.open(os.path.join(args.datadir,
                                     'features.jsonl')) as reader:
        for obj in reader.iter(type=dict, skip_invalid=True):
            rows += 1

    clear(args.datadir)
    ember.create_vectorized_features(args.datadir, rows)

    # Train and save model
    print("Training LightGBM model")
    lgbm_model = ember.train_model(args.datadir, rows)
    lgbm_model.save_model(os.path.join(args.output, "model.txt"))
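
This snippet calls a clear() helper that is not shown. A plausible sketch, assuming its job is to delete stale vectorized .dat files so that ember regenerates them from features.jsonl:

import os

def clear(datadir):
    # Hypothetical helper: remove previously vectorized .dat files so
    # ember.create_vectorized_features() rebuilds them from scratch.
    for name in ("X_train.dat", "y_train.dat", "X_test.dat", "y_test.dat"):
        path = os.path.join(datadir, name)
        if os.path.exists(path):
            os.remove(path)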
Example #4
def main():
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("-v",
                        "--featureversion",
                        type=int,
                        default=2,
                        help="EMBER feature version")
    parser.add_argument("-m",
                        "--metadata",
                        action="store_true",
                        help="Create metadata CSVs")
    parser.add_argument("-t",
                        "--train",
                        action="store_true",
                        help="Train an EMBER model")
    parser.add_argument("datadir",
                        metavar="DATADIR",
                        type=str,
                        help="Directory with raw features")
    parser.add_argument("--optimize",
                        help="gridsearch to find best parameters",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(
            args.datadir))

    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    if args.metadata:
        ember.create_metadata(args.datadir)

    if args.train:
        params = {
            "boosting": "gbdt",
            "objective": "binary",
            "num_iterations": 1000,
            "learning_rate": 0.05,
            "num_leaves": 2048,
            "max_depth": 15,
            "min_data_in_leaf": 50,
            "feature_fraction": 0.5
        }
        if args.optimize:
            params = ember.optimize_model(args.datadir)
            print("Best parameters: ")
            print(json.dumps(params, indent=2))

        print("Training LightGBM model")
        lgbm_model = ember.train_model(args.datadir, params,
                                       args.featureversion)
        lgbm_model.save_model(os.path.join(args.datadir, "model.txt"))
Example #5
def vectorize(self):
    # TODO: add an error check that the input really is a jsonl file.
    if self.rows == 0:
        # logger.info('[Error] Please check if jsonl file is empty ...')
        return -1

    ember.create_vectorized_features(self.jsonlpath, self.output, self.rows, self.features, self.dim)
Example #6
def createVectorizedFeatures(self):
    if not os.path.exists(self.outDir) or not os.path.isdir(self.outDir):
        print("{} is not a directory with raw feature files".format(
            self.outDir))
    X_train_path = os.path.join(self.outDir, "X_train.dat")
    y_train_path = os.path.join(self.outDir, "y_train.dat")

    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(self.outDir)
    else:
        print("Vectorized features (.dat files) are already created")
Example #7
def init_vectorized_features(dataset_dir: str):
    """
    Required for the generation of '.dat' data files

    :param dataset_dir: directory to the base location of the dataset
    :return:
    """
    try:
        assert os.path.exists(dataset_dir)

        ember.create_vectorized_features(dataset_dir, 1)
    except AssertionError:
        raise Exception(
            "[ASSERTION ERROR] The path to base directory of dataset provided does not exist"
        )
Example #8
def main():
    datadir = '/home/mira/research/dataset/ember.2'

    if not os.path.exists(datadir) or not os.path.isdir(datadir):
        print("{} is not a directory".format(datadir))
        return

    X_train_path = os.path.join(datadir, "X_train.dat")
    y_train_path = os.path.join(datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("[{}] Creating vectorized features".format(
            datetime.datetime.now()))
        ember.create_vectorized_features(datadir)

    print("[{}] Training LightGBM model".format(datetime.datetime.now()))
    lgbm_model = ember.train_model(datadir)
    lgbm_model.save_model(os.path.join(datadir, "model.txt"))
    print("[{}] Done".format(datetime.datetime.now()))
Example #9
File: main.py Project: SYHPARK/maldetect
# from tensorflow import keras
import os

import ember
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Input, Dense, Dropout
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.keras.utils import to_categorical

datadir = './data/ember2017_1/'

# create vectorized features
X_train_path = os.path.join(datadir, "X_train.dat")
y_train_path = os.path.join(datadir, "y_train.dat")
if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
    print("[*] Creating vectorized features")
    ember.create_vectorized_features(datadir, 1)

print("[*] training: read vectorized features")
x_train, y_train = ember.read_vectorized_features(datadir, "train", 1)

print("[*] testing: read vectorized features")
x_test, y_test = ember.read_vectorized_features(datadir, "test", 1)

train_rows = y_train != -1
print(train_rows.size)
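
The excerpt stops after building the label mask. A plausible continuation, assuming the Keras layers imported above are used for a small binary classifier (the mask drops EMBER's unlabeled rows):

# Hypothetical continuation: drop unlabeled rows and fit a dense network.
x_train, y_train = x_train[train_rows], y_train[train_rows]

model = Sequential([
    Dense(512, activation='relu', input_shape=(x_train.shape[1],)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=512,
          validation_split=0.1, callbacks=[EarlyStopping(patience=2)])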
Example #10
import ember
import os
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import numpy as np

data_dir = "/home/cuckoo/Desktop/ember/ember2018/"
feature_version = 2

if not (os.path.exists(os.path.join(
        data_dir, f"X_train_{feature_version}.dat")) and os.path.exists(
            os.path.join(data_dir, f"y_train_{feature_version}.dat"))):
    print("Creating vectorized features")
    ember.create_vectorized_features(data_dir, feature_version=feature_version)

#_ = ember.create_metadata(data_dir)

#emberdf = ember.read_metadata(data_dir)
X_test, y_test = ember.read_vectorized_features(
    data_dir, subset="test", feature_version=feature_version)
#X_train, y_train = ember.read_vectorized_features(data_dir, subset="train", feature_version=3)
with open(os.path.join(data_dir, f"SGDR_model_{feature_version}.pkl"),
          'rb') as f:
    model = pickle.load(f)
    y_test_pred = model.predict(X_test)

print("ROC AUC:", roc_auc_score(y_test, y_test_pred))
Example #11
y_train.dat and y_test.dat
"""

import argparse
from sys import argv

import ember


def parse_arguments(argv):
    """Parse command line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        dest='data_dir',
                        type=str,
                        default='data',
                        help='Path to data directory.')
    parser.add_argument('--scale',
                        dest='scale',
                        type=float,
                        default=1.,
                        help='Scale of training/test dataset.')
    return parser.parse_args(argv)


# Parse arguments
args = parse_arguments(argv[1:])
data_dir = args.data_dir

ember.create_vectorized_features(data_dir, scale=args.scale)
Example #12
File: training.py Project: 6r0k3d/ml-intro
def main():
    debug = True
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("datadir",
                        metavar="DATADIR",
                        type=str,
                        help="Directory with raw features")
    args = parser.parse_args()

    # If model data doesn't exist yet, create it from raw features
    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(
            args.datadir))

    X_train_path = os.path.join(args.datadir, "X_train.dat")
    y_train_path = os.path.join(args.datadir, "y_train.dat")
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir)

    # Get training and testing data
    X_train, y_train, X_test, y_test = ember.read_vectorized_features(
        args.datadir)

    if debug:
        print("X_train shape: ", X_train.shape)
        print("y_train shape: ", y_train.shape)
        print("X_test shape: ", X_test.shape)
        print("y_test shape: ", y_test.shape)

    # Convert memmap to pandas series for metrics
    y_test = pandas.Series(data=y_test)

    # Decision Tree Learner
    from sklearn import tree
    tree_clf = tree.DecisionTreeClassifier(max_depth=5)
    """
    tree_model_path = os.path.join(args.datadir, "tree_model.p")
    # Train model if it doesn't exist
    if not (os.path.exists(tree_model_path)):
        print("Training model")
        tree_clf.fit(X_train, y_train)
        pickle.dump(tree_clf, open("tree_model.p","wb"))

    saved_tree_clf = pickle.load(open("tree_model.p", "rb"))
    """

    tree_clf.fit(X_train, y_train)
    tree_dot = tree.export_graphviz(tree_clf, out_file=None)
    graph = graphviz.Source(tree_dot)
    graph.render("tree")

    y_pred = tree_clf.predict(X_test)

    print("\n##### Metrics #####\n")
    print("Accuracy Score")
    print(metrics.accuracy_score(y_test, y_pred), "\n")

    print("Class distribution\n", y_test.value_counts(), "\n")
    print("Average Malware: ", y_test.mean())
    print("Average Benign: ", 1 - y_test.mean())
    print("Null Accuracy: ", max(y_test.mean(), 1 - y_test.mean()), "\n")

    print("Confusion Matrix")
    print("[[TN   FP]\n [FN   TP]]\n")
    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion, "\n")

    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]

    print("Accuracy: how often is the classifier right?")
    print("Accuracy from Confusion Matrix")
    print("(TP + TN) / float(TP + TN + FP + FN)")
    print((TP + TN) / float(TP + TN + FP + FN), "\n")

    print("Classification Error: How often is the classifier wrong?")
    print("(1 - metrics.accuracy_score(y_test, y_pred)")
    print((1 - metrics.accuracy_score(y_test, y_pred)), "\n")

    print(
        "Sensitivity: When the actual value is positive,\n how often is the prediction right?"
    )
    print("Also call 'recall'")
    print("TP / float(TP + FN)")
    print((TP / float(TP + FN)), "\n")
    print(metrics.recall_score(y_test, y_pred))

    print(
        "Specificity: When value is negative, how often is the prediction right?"
    )
    print("TN / float(TN + FP)")
    print(TN / float(TN + FP), "\n")

    print(
        "False Pos. Rate: When the actual value is negative,\n how often is the prediction wrong?"
    )
    print("FP / float(TN + FP)")
    print(FP / float(TN + FP), "\n")

    print(
        "Precision: When a positive value is predicted,\n how often is the prediction right?"
    )
    print("TP / float(TP + FP)")
    print(TP / float(TP + FP), "\n")

    # Classification threshold
    # Must use y_pred_prob for the positive class!
    y_pred_prob = tree_clf.predict_proba(X_test)[:, 1]

    # ROC: Choose a threshold that balances sensitivity and specificity
    # Ideal plot hugs top left of graph: high sensitivity and high specificity
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
    plt.plot(fpr, tpr)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.title('ROC curve for malware classifier')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.grid(True)
    plt.show()

    evaluate_threshold(tpr, fpr, thresholds, 0.5)

    # AUC: percentage of ROC plot that is under the curve
    # Higher AUC indicates top left graph ROC
    # Useful for imbalanced classes
    print("AUC")
    print(metrics.roc_auc_score(y_test, y_pred_prob), "\n")
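
evaluate_threshold() is not defined in this excerpt. A minimal sketch, assuming it reports sensitivity and specificity at the ROC point whose threshold is closest to the requested value:

import numpy as np

def evaluate_threshold(tpr, fpr, thresholds, threshold):
    # Hypothetical helper: locate the ROC point nearest the requested
    # classification threshold and print the trade-off there.
    idx = int(np.argmin(np.abs(thresholds - threshold)))
    print("Sensitivity:", tpr[idx])
    print("Specificity:", 1 - fpr[idx])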
Example #13
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 28 12:42:09 2019

@author: Piyush
"""

import ember

ember.create_vectorized_features('D:/ml_projects/datasets/ember')
ember.create_metadata('D:/ml_projects/datasets/ember')
Example #14
import ember
import h5py

ember_dir = "../data/ember2018/"

ember.create_vectorized_features(ember_dir)
X_train, y_train, X_test, y_test = ember.read_vectorized_features(ember_dir)

with h5py.File("../data/Ember2018.h5", 'w') as f:
    grp_train = f.create_group("train")
    grp_train.create_dataset("data", data=X_train.transpose())
    grp_train.create_dataset("targets", data=y_train)

    grp_test = f.create_group("test")
    grp_test.create_dataset("data", data=X_test.transpose())
    grp_test.create_dataset("targets", data=y_test)
Example #15
def main():
    prog = "train_ember"
    descr = "Train an ember model from a directory with raw feature files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("--modelname",
                        type=str,
                        default="SGD",
                        help="Model name")
    parser.add_argument("-v",
                        "--featureversion",
                        type=int,
                        default=2,
                        help="EMBER feature version")
    parser.add_argument("datadir",
                        metavar="DATADIR",
                        type=str,
                        help="Directory with raw features")
    parser.add_argument("--optimize",
                        help="gridsearch to find best parameters",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.datadir) or not os.path.isdir(args.datadir):
        parser.error("{} is not a directory with raw feature files".format(
            args.datadir))

    X_train_path = os.path.join(args.datadir,
                                f"X_train_{args.featureversion}.dat")
    y_train_path = os.path.join(args.datadir,
                                f"y_train_{args.featureversion}.dat")
    # if they don't exist, compute them.
    if not (os.path.exists(X_train_path) and os.path.exists(y_train_path)):
        print("Creating vectorized features")
        ember.create_vectorized_features(args.datadir, args.featureversion)

    #feature_name = ['feature_' + str(col) for col in range(num_feature)]

    params = {
        "boosting": "gbdt",
        "objective": "regression",
        "num_iterations": 1000,
        "learning_rate": 0.05,
        "num_leaves": 2048,
        "max_depth": 15,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.5,
        "num_threads": 2,
    }
    if args.optimize:
        params = ember.optimize_model(args.datadir)
        print("Best parameters: ")
        print(json.dumps(params, indent=2))

    print("Training Classifier model")
    lgbm_model = ember.train_model(args.datadir, params, args.featureversion)

    # Save to file in the current working directory
    #pkl_filename = os.path.join(args.datadir,f"{args.modelname}_model_{args.featureversion}.pkl")
    # with open(pkl_filename, 'wb') as f:
    #    pickle.dump(lgbm_model, f)
    print(f"file dumped into model.txt .... ")
    lgbm_model.save_model(
        os.path.join(args.datadir, f"model_{args.featureversion}.txt"))

    print('Plotting feature importances...')
    ax = lgb.plot_importance(lgbm_model, max_num_features=10)
    plt.savefig(f'lgbm_importances-0{args.featureversion}.png')

    # run
    os.system(f"xdg-open lgbm_importances-0{args.featureversion}.png")
Example #16
import ember

# ember hard-codes the dataset size, so it is simplest to let it convert
# all 1.1M samples. This takes roughly 30 minutes and only has to be done
# once: the vectorized data is written to the data directory and
# memory-mapped at training time.
ember.create_vectorized_features("./data/ember/")
ember.create_metadata("./data/ember/")
Example #17
File: ember_to_mat.py Project: e-liner/ODL
def create_vectorized_ember():
    ember.create_vectorized_features("./data/ember2018/")
    ember.create_metadata("./data/ember2018/")