def tianchi_api():
    # Fit the shared API n-gram TF-IDF on the white/black/test corpus, then
    # encode the Tianchi competition traces with the same vocabulary.
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1,
                              max_features=500)
    white = pd.read_csv(os.path.join("features",
                                     "white.csv"))[['file_name', 'api_name']]
    black = load_df(os.path.join("features", "black"))
    test = load_df(os.path.join("features", "test"))
    tianchi = pd.read_csv("security_train.csv").rename(
        columns={"file_id": "file_name"})
    full = pd.concat([white, black, test])
    full_str = to_str(full)
    print(1)
    api_vec.fit(full_str)
    print(2)
    black_output, name_list = to_str(tianchi, mode=1, column_name="api")
    save_dict(name_list, os.path.join("features", "tianchi_name_list"))
    black_output = api_vec.transform(black_output)
    scipy.sparse.save_npz(os.path.join("features", "tianchi.npz"),
                          black_output)
def stage2_api(feature_num=500):
    # Same corpus fit as tianchi_api(), but the transform target is the
    # stage-2 traces; feature_num sets the TF-IDF vocabulary size.
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1,
                              max_features=feature_num)
    white = pd.read_csv(os.path.join("features",
                                     "white.csv"))[['file_name', 'api_name']]
    black = load_df(os.path.join("features", "black"))
    test = load_df(os.path.join("features", "test"))
    stage2 = load_df(os.path.join("features", "stage2"))
    full = pd.concat([white, black, test])
    full_str = to_str(full)
    print(1)
    api_vec.fit(full_str)
    print(2)
    black_output, name_list = to_str(stage2, mode=1)
    save_dict(name_list,
              os.path.join("features", "stage2_name_list" + str(feature_num)))
    black_output = api_vec.transform(black_output)
    scipy.sparse.save_npz(
        os.path.join("features", "stage2" + str(feature_num) + ".npz"),
        black_output)
def api():
    # Fit one TF-IDF over the concatenated white/black/test API sequences,
    # then save a sparse feature matrix and a per-file name list per split.
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1,
                              max_features=500)
    white = pd.read_csv("white.csv")[['file_name', 'api_name']]
    black = load_df("black")
    test = load_df("test")
    full = pd.concat([white, black, test])
    full_str = to_str(full)
    print(1)
    api_vec.fit(full_str)
    print(2)
    black_output, name_list = to_str(black, mode=1)
    save_dict(name_list, "black_name_list")
    black_output = api_vec.transform(black_output)
    scipy.sparse.save_npz("black.npz", black_output)
    white_output, name_list = to_str(white, mode=1)
    save_dict(name_list, "white_name_list")
    white_output = api_vec.transform(white_output)
    scipy.sparse.save_npz("white.npz", white_output)
    test_str, name_list = to_str(test, mode=1)
    save_dict(name_list, "test_name_list")
    test_output = api_vec.transform(test_str)
    scipy.sparse.save_npz("test.npz", test_output)
def stage2_api_new(feature_num=500):
    # Variant of stage2_api() that fits the TF-IDF on the stage-2 traces
    # themselves instead of on the white/black/test corpus.
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1,
                              max_features=feature_num)
    stage2 = load_df(os.path.join("features", "stage2"))
    black_output, name_list = to_str(stage2, mode=1)
    print(1)
    api_vec.fit(black_output)
    print(2)
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + str(feature_num)))
    black_output = api_vec.transform(black_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + str(feature_num) + ".npz"), black_output)
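# Hedged usage sketch for the four API n-gram builders above. The call order
# is an assumption: each function fits its own vectorizer, so they are
# independent, but the white.csv/black/test feature files must already exist
# for the corpus-fit variants to load.
# api()                            # white/black/test -> *.npz + *_name_list
# stage2_api(feature_num=500)      # corpus fit, stage-2 transform
# tianchi_api()                    # corpus fit, Tianchi transform
# stage2_api_new(feature_num=500)  # fits on the stage-2 traces only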
def stage_2_attribute(suffix="_dll",
                      use_less_value=False,
                      type_name="",
                      map_func=None,
                      max_feature=1000):
    # Encode a stage-2 attribute stream (e.g. loaded DLL paths) with the
    # TF-IDF trained by train_tf_idf(). With use_less_value, values are first
    # reduced (by default to the last path component).
    stage2 = load_df(os.path.join("features", "stage2" + suffix), mode=1)
    if use_less_value:
        if map_func is None:
            stage2["value"] = stage2["value"].map(lambda x: x.split("\\")[-1])
        else:
            stage2["value"] = stage2["value"].map(map_func)
    stage2_output, name_list = to_str(stage2, mode=1, column_name="value")
    # Pass the caller's suffix through instead of hardcoding "_dll"; it is
    # only consulted when data is None, but the literal was misleading.
    api_vec, _ = train_tf_idf(suffix=suffix,
                              use_less_value=use_less_value,
                              map_func=map_func,
                              data=stage2_output,
                              max_feature=max_feature)
    save_dict(
        name_list,
        os.path.join(get_root_path(), "features",
                     "stage2_name_list" + suffix + type_name))
    stage2_output = api_vec.transform(stage2_output)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + suffix + type_name + ".npz"), stage2_output)
def train_tf_idf(suffix="_dll",
                 use_less_value=False,
                 map_func=None,
                 max_feature=2000,
                 data=None):
    # Fit an attribute-value TF-IDF either on pre-built strings (data) or on
    # the concatenated white/black/test attribute files for this suffix.
    api_vec = TfidfVectorizer(ngram_range=(1, 5),
                              min_df=3,
                              max_df=0.9,
                              strip_accents='unicode',
                              use_idf=1,
                              smooth_idf=1,
                              sublinear_tf=1,
                              max_features=max_feature)
    if data is None:
        white = load_df(os.path.join(get_root_path(), "features",
                                     "white" + suffix), mode=1)
        black = load_df(os.path.join(get_root_path(), "features",
                                     "black" + suffix), mode=1)
        test = load_df(os.path.join(get_root_path(), "features",
                                    "test" + suffix), mode=1)
        if use_less_value:
            if map_func is None:
                for i in [white, black, test]:
                    i["value"] = i["value"].map(lambda x: x.split("\\")[-1])
            else:
                for i in [white, black, test]:
                    i["value"] = i["value"].map(map_func)
        full = pd.concat([white, black, test])
        full_str = to_str(full, column_name="value")
    else:
        full_str = data
    print(1)
    api_vec.fit(full_str)
    print(2)
    if data is None:
        return api_vec, [white, black, test]
    return api_vec, None
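# Illustrative map_func for the attribute pipeline above. _basename is a
# hypothetical helper (not part of the repo) that mirrors the default
# "last path component" reduction applied when map_func is None.
def _basename(value):
    # e.g. "C:\\Windows\\System32\\kernel32.dll" -> "kernel32.dll"
    return value.split("\\")[-1]

# stage_2_attribute(suffix="_dll", use_less_value=True,
#                   type_name="_base", map_func=_basename, max_feature=1000)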
def inter(load_name):
    white = load_df(load_name, mode=1)[[
        'file_name', 'api_name', 'ret_value', 'call_time'
    ]]
    white.rename(columns={
        'ret_value': 'return_value',
        'api_name': 'api',
        'file_name': 'file_id',
        'call_time': 'index'
    }, inplace=True)
    return white
def extract_features(load_name, extract_function):
    # Load the full call records, rename to the extractor's expected schema,
    # apply the caller-supplied extract_function, and rename file_id back.
    white = load_df(load_name, mode=1)[[
        'file_name', 'api_name', 'call_name', 'call_pid', 'ret_value',
        'apiArg_list_count', 'exInfo_list_count', 'call_time'
    ]]
    white.rename(columns={
        'ret_value': 'return_value',
        'api_name': 'api',
        'file_name': 'file_id',
        'call_time': 'index'
    }, inplace=True)
    a = extract_function(white)
    a.rename(columns={'file_id': 'file_name'}, inplace=True)
    return a
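# Minimal sketch of an extract_function compatible with extract_features().
# _api_counts is a hypothetical example: any callable that maps the renamed
# frame (file_id/api/return_value/index schema) to one row per file works.
def _api_counts(df):
    # One row per sample: total number of recorded API calls.
    out = df.groupby('file_id', as_index=False)['api'].count()
    return out.rename(columns={'api': 'api_count'})

# feats = extract_features(os.path.join("features", "stage2"), _api_counts)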
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from basic_function import load_dict, load_df, get_root_path, save_dict
from metrics import com_acc
from model import get_model
from shorten_api_list import delete_repeat_pattern, delete_same_pattern

shape = (512, 64)
input_dim = 92 + 1  # presumably 92 api ids plus one padding/unknown slot
batch_size = 32
epochs = 50
class_num = 2

api_dict = load_dict(os.path.join(get_root_path(), "features",
                                  "api_dict.txt"))
white = load_df(os.path.join(get_root_path(), "features", "white"),
                mode=1)[['file_name', 'api_name', 'call_time']]
black = load_df(os.path.join(get_root_path(), "features", "black"),
                mode=1)[['file_name', 'api_name', 'call_time']]
white_label = np.zeros(white.shape[0])
black_label = np.ones(black.shape[0])
full = pd.concat([white, black], sort=False)
label = np.concatenate((white_label, black_label))
full['label'] = label
full['api_name'] = full['api_name'].map(api_dict)
# Alternative input: score the stage-2 traces instead. The dummy zero labels
# and the api_dict mapping below belong to this path only; leaving them
# active would overwrite the real labels and re-map already-mapped ids.
# full = load_df(os.path.join(get_root_path(), "features", "stage2"), mode=1)[['file_name', 'api_name', 'call_time']]
# full['label'] = np.zeros((full.shape[0], ))
# full['api_name'] = full['api_name'].map(api_dict)
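# Minimal sketch, assuming the model consumes fixed-size id matrices: group
# the mapped api ids per file in call_time order, then truncate/pad to the
# shape constant above. build_matrix is a hypothetical helper, not the
# repo's confirmed preprocessing.
def build_matrix(frame, shape=shape, pad_id=input_dim - 1):
    rows = []
    for _, g in frame.sort_values('call_time').groupby('file_name'):
        seq = g['api_name'].to_numpy()[:shape[0] * shape[1]]
        mat = np.full(shape[0] * shape[1], pad_id)
        mat[:len(seq)] = seq
        rows.append(mat.reshape(shape))
    return np.stack(rows)

# X = build_matrix(full)  # one (512, 64) id matrix per sample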
import pandas as pd

from basic_function import load_df, save_dict

# Build a global api_name -> integer id mapping from the stage-2 traces.
full = load_df("../features/stage2")
# full.fillna("None", inplace=True)
api_list = set(full['api_name'])
print(len(api_list))
api_dict = dict(zip(api_list, range(len(api_list))))
save_dict(api_dict, "../features/api_dict.txt")
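# Hedged usage sketch: the mapping saved above is reloaded in the training
# script via load_dict and applied with Series.map; ids run 0..len-1, which
# is consistent with reserving one extra slot as in input_dim = 92 + 1.
# from basic_function import load_dict
# api_dict = load_dict("../features/api_dict.txt")
# full['api_id'] = full['api_name'].map(api_dict)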