Example #1
def pickle_syscalls():
    # Extract training features, densify the sparse matrix, and pickle
    # the matrix together with the feature dictionary and class labels.
    mat, key, cats, _ = classify.extract_feats([syscalls], 'train')
    mat = np.asarray(mat.todense())
    with open('matrix_train', 'wb') as matrix_train:
        pickle.dump((mat, key, cats), matrix_train)

    # Extract test features with the same feature dictionary, then
    # pickle the test matrix together with the example ids.
    test_mat, _, _, ids = classify.extract_feats([syscalls], direc='test',
                                                 global_feat_dict=key)
    test_mat = np.asarray(test_mat.todense())
    with open('matrix_test', 'wb') as matrix_test:
        pickle.dump((test_mat, ids), matrix_test)
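
The pickled matrices can be read back later without re-running feature extraction; a minimal sketch, assuming the filenames used above:

import pickle

# Load the training matrix, feature dictionary, and class labels back.
with open('matrix_train', 'rb') as f:
    mat, key, cats = pickle.load(f)

# Load the test matrix and its example ids.
with open('matrix_test', 'rb') as f:
    test_mat, ids = pickle.load(f)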
Example #2
def example_structure_plot():
    # Plot the num_processes feature against the class label, drawing
    # class 8 in black and all other classes in red.
    mat, key, cats, _ = classify.extract_feats([structure], 'train')
    for i in range(mat.shape[0]):
        color = 'black' if cats[i] == 8 else 'red'
        plt.scatter([mat[i, key['num_processes']]], [cats[i]], c=color)
    plt.show()
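
The point-by-point loop above issues one scatter call per sample; a vectorized sketch of the same plot, assuming mat is the sparse matrix returned by extract_feats (as in Example #1, where it is densified with .todense()):

def example_structure_plot_vectorized():
    # Densify the one feature column, then draw class 8 in black
    # and every other class in red with two scatter calls.
    mat, key, cats, _ = classify.extract_feats([structure], 'train')
    col = np.asarray(mat[:, key['num_processes']].todense()).ravel()
    cats = np.asarray(cats)
    mask = cats == 8
    plt.scatter(col[mask], cats[mask], c='black')
    plt.scatter(col[~mask], cats[~mask], c='red')
    plt.show()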
Example #3
def get_stats(key_name):
    # Per-class mean, variance, and standard deviation of one feature.
    mat, key, cats, _ = classify.extract_feats([structure], 'train')
    mat_len = mat.shape[0]
    sums = np.zeros(NUM_MALEWARE)
    counts = np.zeros(NUM_MALEWARE)
    var = np.zeros(NUM_MALEWARE)
    # First pass: accumulate per-class sums and counts.
    for i in range(mat_len):
        sums[cats[i]] += mat[i, key[key_name]]
        counts[cats[i]] += 1
    means = sums / counts
    # Second pass: accumulate squared deviations from the class means.
    for i in range(mat_len):
        var[cats[i]] += (mat[i, key[key_name]] - means[cats[i]])**2
    var = var / counts
    std = np.sqrt(var)
    return (means, var, std)
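
The two accumulation loops can also be collapsed with numpy's bincount; a sketch under the same assumptions (integer class labels in [0, NUM_MALEWARE) and a sparse mat, as in Example #1):

def get_stats_bincount(key_name):
    mat, key, cats, _ = classify.extract_feats([structure], 'train')
    # Densify the one feature column and coerce the labels to an array.
    vals = np.asarray(mat[:, key[key_name]].todense()).ravel()
    cats = np.asarray(cats)
    # bincount with weights does the per-class accumulation in one call.
    counts = np.bincount(cats, minlength=NUM_MALEWARE)
    means = np.bincount(cats, weights=vals, minlength=NUM_MALEWARE) / counts
    var = np.bincount(cats, weights=(vals - means[cats])**2,
                      minlength=NUM_MALEWARE) / counts
    return (means, var, np.sqrt(var))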
Example #4
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("USAGE: create_sqlite_db.py [db_name] [num_data_points]")
        sys.exit(1)
    if len(sys.argv) == 3:
        classification_starter.TOTAL_NUM_DATA = int(sys.argv[2])
        print("Using at most", classification_starter.TOTAL_NUM_DATA, "data points...")
    dbname = sys.argv[1]
    if not dbname.endswith(".db"):
        dbname = dbname + ".db"

    print("creating database...")
    conn = sqlite3.connect(dbname)
    c = conn.cursor()

    # extract features
    ffs = features.ALL_FEATURES
    print("extracting training features...")
    train_dir = "train"
    X_train, global_feat_dict, t_train, train_ids = classification_starter.extract_feats(
        ffs, train_dir)

    print("creating table...")
    create_table(conn, c, global_feat_dict)
    print("writing data...")
    write_data(conn, c, X_train, global_feat_dict, t_train, train_ids)
    print("creating index...")
    create_index(conn, c)
    print("done!")
Example #5
def loadData(params, withhold, ffs, trainDir="train", testDir="test"):
    """
    loads the movie data

    arguments:
        params      : dict with several keys:
            load        : loading mode; either 'extract' to load from
                        `params['extractFile']`, 'split' to load from
                        `params['splitFile']`, or None to extract features and
                        save to `params['extractFile']` and/or
                        `params['splitFile']`.
            extractFile : file to load/save extracted features to/from,
                        depending on the loading mode
            splitFile   : file to load/save split data to/from,
                        depending on the loading mode
            loadTest    : True to load the test data (when withhold=0), False
                        to return empty arrays

        withhold    : number of data points to withhold for cross-validation;
                    if 0 and `params['loadTest']` is True, the test data will
                    be loaded and returned
        ffs         : list of feature functions
        trainDir    : path to the training file (train.xml)
        testDir     : path to the test cases file

    returns:
        X_train, y_train, train_ids, X_test, y_test, test_ids
    """
    # load data from `params['splitFile']`
    if params['load'] == 'split':
        X_train, y_train, train_ids, X_test, y_test, test_ids = unpickle(
            params['splitFile'])
        print("loaded %d training fds" % len(train_ids))
        print("withheld %d fds for testing" % len(test_ids))
    else:
        # extract data from scratch
        if params['load'] is None:
            fds, targets, train_ids = classif.extract_feats_helper(
                ffs, trainDir)
            print("loaded %d fds" % len(fds))
            if params['extractFile'] is not None:
                pickle((fds, targets, train_ids), params['extractFile'])
        # load data from `params['extractFile']`, but split it anew
        elif params['load'] == 'extract':
            fds, targets, train_ids = unpickle(params['extractFile'])
            print("loaded %d fds" % len(fds))

        # load the test data from the testcases file
        if withhold == 0:
            print("Extracting test data features")
            X_train, feat_dict = classif.make_design_mat(fds)
            y_train = np.array(targets)
            train_ids = []

            if params['loadTest']:
                X_test, _, y_test, test_ids = classif.extract_feats(
                    ffs, testDir, global_feat_dict=feat_dict)
            else:
                X_test, y_test, test_ids = np.empty(shape=(0, 0)), np.empty(
                    shape=(0, 0)), []
            print("Done")
        # withhold some of the training data as test data
        else:
            fds, targets, train_ids, fdsTest, targetsTest, test_ids = splitData(
                fds, targets, train_ids, withhold, params['splitMethod'])
            X_train, feat_dict = classif.make_design_mat(fds)
            X_test, _ = classif.make_design_mat(fdsTest, feat_dict)
            y_train = np.array(targets)
            y_test = np.array(targetsTest)

        if params['splitFile'] is not None:
            print("Writing split file...")
            pickle((X_train, y_train, train_ids, X_test, y_test, test_ids),
                   params['splitFile'])
            print("Done writing split file")

    return X_train, y_train, train_ids, X_test, y_test, test_ids
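
A hedged usage sketch; the params keys follow the docstring above, and `some_feature_fn` is a placeholder for a real feature function:

# Extract features from scratch, cache them, and load the real test set
# (withhold=0 skips splitData, so params['splitMethod'] is never read).
params = {
    'load': None,                # extract features from scratch
    'extractFile': 'feats.pkl',  # cache the extracted features here
    'splitFile': None,           # skip writing a split file
    'loadTest': True,            # load the real test set
}
X_train, y_train, train_ids, X_test, y_test, test_ids = loadData(
    params, withhold=0, ffs=[some_feature_fn])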