示例#1
0
def tianchi_api():
    """Fit a 1-5 gram TF-IDF vectorizer on the white/black/test corpus,
    then vectorize the Tianchi competition data with that vocabulary,
    saving the sparse matrix (tianchi.npz) and the per-file name list.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 5),
                                 min_df=3,
                                 max_df=0.9,
                                 strip_accents='unicode',
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 max_features=500)

    # Training corpus is assembled from the three local feature dumps.
    white_df = pd.read_csv(os.path.join("features",
                                        "white.csv"))[['file_name', 'api_name']]
    black_df = load_df(os.path.join("features", "black"))
    test_df = load_df(os.path.join("features", "test"))
    tianchi_df = pd.read_csv("security_train.csv").rename(
        columns={"file_id": "file_name"})

    corpus = to_str(pd.concat([white_df, black_df, test_df]))

    print(1)
    vectorizer.fit(corpus)

    print(2)

    # Transform the Tianchi set with the fitted vocabulary and persist both
    # the file-name ordering and the sparse TF-IDF matrix.
    docs, names = to_str(tianchi_df, mode=1, column_name="api")
    save_dict(names, os.path.join("features", "tianchi_name_list"))
    scipy.sparse.save_npz(os.path.join("features", "tianchi.npz"),
                          vectorizer.transform(docs))
示例#2
0
def stage2_api(feature_num=500):
    """Fit a 1-5 gram TF-IDF vectorizer on the white/black/test corpus and
    vectorize the stage-2 data with that shared vocabulary.

    Writes features/stage2<feature_num>.npz (sparse matrix) and
    features/stage2_name_list<feature_num> (file-name ordering).
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 5),
                                 min_df=3,
                                 max_df=0.9,
                                 strip_accents='unicode',
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 max_features=feature_num)

    # Corpus for fitting: white/black/test only (stage2 is held out).
    white_df = pd.read_csv(os.path.join("features",
                                        "white.csv"))[['file_name', 'api_name']]
    black_df = load_df(os.path.join("features", "black"))
    test_df = load_df(os.path.join("features", "test"))
    stage2_df = load_df(os.path.join("features", "stage2"))

    corpus = to_str(pd.concat([white_df, black_df, test_df]))

    print(1)
    vectorizer.fit(corpus)

    print(2)

    docs, names = to_str(stage2_df, mode=1)
    save_dict(names,
              os.path.join("features", "stage2_name_list" + str(feature_num)))
    scipy.sparse.save_npz(
        os.path.join("features", "stage2" + str(feature_num) + ".npz"),
        vectorizer.transform(docs))
示例#3
0
def api():
    """Fit a 1-5 gram TF-IDF vectorizer on the combined white/black/test
    API-call corpus, then vectorize each subset and save it to disk.

    Side effects: writes black.npz / white.npz / test.npz (sparse matrices)
    and the matching *_name_list dict files to the working directory.
    """
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1,
                              max_features=500)

    white = pd.read_csv("white.csv")[['file_name', 'api_name']]
    black = load_df("black")
    test = load_df("test")

    # Fit on the full corpus so all three subsets share one vocabulary.
    full = pd.concat([white, black, test])
    full_str = to_str(full)

    print(1)
    api_vec.fit(full_str)

    print(2)

    def _vectorize_and_save(df, stem):
        # Convert per-file API sequences to strings, persist the file-name
        # ordering, then save the TF-IDF matrix as <stem>.npz.  Factored out
        # to remove the original's copy-pasted transform/save triplication.
        docs, name_list = to_str(df, mode=1)
        save_dict(name_list, stem + "_name_list")
        scipy.sparse.save_npz(stem + ".npz", api_vec.transform(docs))

    # Same order as the original: black, white, test.
    _vectorize_and_save(black, "black")
    _vectorize_and_save(white, "white")
    _vectorize_and_save(test, "test")
示例#4
0
def stage2_api_new(feature_num=500):
    """Fit TF-IDF directly on the stage-2 corpus (unlike stage2_api, which
    fits on white/black/test) and save the transformed stage-2 matrix plus
    the per-file name list under the project root."""
    vectorizer = TfidfVectorizer(ngram_range=(1, 5),
                                 min_df=3,
                                 max_df=0.9,
                                 strip_accents='unicode',
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 max_features=feature_num)

    stage2_df = load_df(os.path.join("features", "stage2"))
    docs, names = to_str(stage2_df, mode=1)

    print(1)
    vectorizer.fit(docs)

    print(2)

    save_dict(
        names,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + str(feature_num)))
    matrix = vectorizer.transform(docs)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + str(feature_num) + ".npz"), matrix)
示例#5
0
def stage_2_attribute(suffix="_dll",
                      use_less_value=False,
                      type_name="",
                      map_func=None,
                      max_feature=1000):
    """Vectorize a stage-2 attribute column ("value") with TF-IDF and save
    the sparse matrix plus the per-file name list.

    Parameters:
        suffix: feature-file suffix selecting which attribute dump to load
            (e.g. "_dll").
        use_less_value: when True, reduce each value before vectorizing
            (basename after the last backslash, or map_func's output).
        type_name: extra tag appended to the output file names.
        map_func: custom value-reduction callable, or None for the default.
        max_feature: maximum TF-IDF vocabulary size.
    """
    stage2 = load_df(os.path.join("features", "stage2" + suffix), mode=1)

    if use_less_value:
        # Default reduction keeps only the basename after the last backslash.
        reducer = (lambda x: x.split("\\")[-1]) if map_func is None else map_func
        stage2["value"] = stage2["value"].map(reducer)
    stage2_output, name_list = to_str(stage2, mode=1, column_name="value")
    # BUG FIX: the original hard-coded suffix="_dll" here, ignoring this
    # function's `suffix` argument.  Harmless today (train_tf_idf skips its
    # loading path when `data` is supplied) but wrong if that ever changes.
    api_vec, _ = train_tf_idf(suffix=suffix,
                              use_less_value=use_less_value,
                              map_func=map_func,
                              data=stage2_output,
                              max_feature=max_feature)

    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + suffix + type_name))
    stage2_output = api_vec.transform(stage2_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + suffix + type_name + ".npz"), stage2_output)
示例#6
0
def train_tf_idf(suffix="_dll",
                 use_less_value=False,
                 map_func=None,
                 max_feature=2000,
                 data=None):
    """Build and fit a 1-5 gram TF-IDF vectorizer.

    If `data` is None, load the white/black/test "value" dumps for `suffix`,
    optionally reduce their values, and fit on the concatenated corpus.
    Otherwise fit directly on `data` (an iterable of document strings).

    Returns:
        (api_vec, frames) where `frames` is [white, black, test] when the
        data was loaded here, or None when `data` was supplied by the caller.
    """
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1,
                              max_features=max_feature)

    if data is None:
        white = load_df(os.path.join(get_root_path(), "features",
                                     "white" + suffix),
                        mode=1)
        black = load_df(os.path.join(get_root_path(), "features",
                                     "black" + suffix),
                        mode=1)
        test = load_df(os.path.join(get_root_path(), "features",
                                    "test" + suffix),
                       mode=1)

        if use_less_value:
            # Collapsed the original's duplicated branches: pick the reducer
            # once (default keeps the path basename), then map each frame.
            reducer = (lambda x: x.split("\\")[-1]) if map_func is None \
                else map_func
            for frame in (white, black, test):
                frame["value"] = frame["value"].map(reducer)

        full = pd.concat([white, black, test])
        full_str = to_str(full, column_name="value")
    else:
        full_str = data

    print(1)
    api_vec.fit(full_str)
    print(2)
    if data is None:
        return api_vec, [white, black, test]
    else:
        return api_vec, None
示例#7
0
 def inter(load_name):
     """Load a call log and rename its columns to the extractor schema
     (file_id / api / return_value / index)."""
     columns = ['file_name', 'api_name', 'ret_value', 'call_time']
     frame = load_df(load_name, mode=1)[columns]
     # Map the raw dump's column names onto the names downstream code expects.
     return frame.rename(columns={
         'ret_value': 'return_value',
         'api_name': 'api',
         'file_name': 'file_id',
         'call_time': 'index'
     })
示例#8
0
def extract_features(load_name, extract_function):
    """Load a raw call log, rename its columns to the schema expected by
    `extract_function`, run it, and map file_id back to file_name."""
    raw_columns = [
        'file_name', 'api_name', 'call_name', 'call_pid', 'ret_value',
        'apiArg_list_count', 'exInfo_list_count', 'call_time'
    ]
    log = load_df(load_name, mode=1)[raw_columns]
    # Rename to the extractor's expected schema.
    log = log.rename(columns={
        'ret_value': 'return_value',
        'api_name': 'api',
        'file_name': 'file_id',
        'call_time': 'index'
    })
    features = extract_function(log)
    # Restore the project-wide column name before returning.
    features.rename(columns={'file_id': 'file_name'}, inplace=True)
    return features
示例#9
0
from sklearn.model_selection import train_test_split

from basic_function import load_dict, load_df, get_root_path, save_dict
from metrics import com_acc
from model import get_model
from shorten_api_list import delete_repeat_pattern, delete_same_pattern
import pickle

# Model / training hyper-parameters.
shape = (512, 64)
input_dim = 92 + 1  # presumably vocab size + 1 (e.g. for padding) — TODO confirm
batch_size = 32
epochs = 50
class_num = 2  # binary classification: white vs black

# Load the api-name -> integer-id mapping and the two labelled call logs.
api_dict = load_dict(os.path.join(get_root_path(), "features", "api_dict.txt"))
white = load_df(os.path.join(get_root_path(), "features", "white"),
                mode=1)[['file_name', 'api_name', 'call_time']]
black = load_df(os.path.join(get_root_path(), "features", "black"),
                mode=1)[['file_name', 'api_name', 'call_time']]

# Label rows: 0 for white, 1 for black.
white_label = np.zeros(white.shape[0])
black_label = np.ones(black.shape[0])

full = pd.concat([white, black], sort=False)
label = np.concatenate((white_label, black_label))
full['label'] = label

# Encode API names as integer ids via the loaded dictionary.
full['api_name'] = full['api_name'].map(api_dict)

# full = load_df(os.path.join(get_root_path(), "features", "stage2"), mode=1)[['file_name', 'api_name', 'call_time']]
# NOTE(review): the two lines below look like leftovers from the commented-out
# stage2 path above.  They overwrite the real 0/1 labels with zeros, and they
# re-map 'api_name' a second time — the values are already integer ids, so a
# second .map with a name-keyed dict would yield NaN.  Verify intent before
# relying on `full` downstream.
full['label'] = np.zeros((full.shape[0], ))
full['api_name'] = full['api_name'].map(api_dict)
示例#10
0
from basic_function import load_df, save_dict
import pandas as pd

full = load_df("../features/stage2")

# Build a name -> integer-id mapping over every distinct API name seen.
api_names = set(full['api_name'])
print(len(api_names))
api_dict = {name: idx for idx, name in enumerate(api_names)}
save_dict(api_dict, "../features/api_dict.txt")