import os

import pandas as pd
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Project helpers. load_df and save_dict come from basic_function (see the
# standalone script at the end of this file); the remaining names are assumed
# to live in the same module.
from basic_function import (load_df, save_dict, to_str, get_root_path,
                            train_tf_idf, train_cluster, connect_params)


def tianchi_api():
    """Fit TF-IDF on the stage-1 API sequences (white/black/test) and use the
    fitted vectorizer to transform the Tianchi dataset."""
    api_vec = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9,
                              strip_accents='unicode', use_idf=True,
                              smooth_idf=True, sublinear_tf=True,
                              max_features=500)
    white = pd.read_csv(os.path.join("features", "white.csv"))[['file_name', 'api_name']]
    black = load_df(os.path.join("features", "black"))
    test = load_df(os.path.join("features", "test"))
    tianchi = pd.read_csv("security_train.csv").rename(
        columns={"file_id": "file_name"})
    # Fit the vocabulary on the union of the stage-1 splits.
    full = pd.concat([white, black, test])
    full_str = to_str(full)
    print("fitting TF-IDF")
    api_vec.fit(full_str)
    print("transforming Tianchi data")
    tianchi_str, name_list = to_str(tianchi, mode=1, column_name="api")
    save_dict(name_list, os.path.join("features", "tianchi_name_list"))
    tianchi_output = api_vec.transform(tianchi_str)
    scipy.sparse.save_npz(os.path.join("features", "tianchi.npz"), tianchi_output)
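# Hedged sketch (assumption, not part of the repo): `to_str` is never shown in
# this file. From its call sites it turns the per-call rows of a DataFrame
# into one space-joined API string per file, and with mode=1 it also returns
# the list of file names. A minimal version consistent with that usage:
def to_str_sketch(df, mode=0, column_name="api_name"):
    # One space-joined token string per file, preserving row order.
    grouped = df.groupby("file_name")[column_name].apply(
        lambda s: " ".join(map(str, s)))
    if mode == 1:
        return grouped.values.tolist(), grouped.index.tolist()
    return grouped.values.tolist()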
def search():
    """Grid-search clustering hyper-parameters and rank the runs by each
    evaluation score."""
    base_parameter = {"data_type": 0, "dimension_reduction": 0}
    top_search = {"n_clusters": [380, 400, 420, 440, 460, 480]}
    cluster_ways = [0]
    cluster_parameter = {0: [{"linkage": 0}]}
    full_parameter = []
    scores_list = []
    for key in top_search:
        for value in top_search[key]:
            full_parameter.append({key: value})
            for cluster_way in cluster_ways:
                full_parameter.append({"cluster_way": cluster_way})
                for cp in cluster_parameter[cluster_way]:
                    full_parameter.append(cp)
                    real_full = [base_parameter] + full_parameter
                    print("params:", connect_params(real_full))
                    scores = train_cluster(**connect_params(real_full))
                    scores_list.append([connect_params(real_full)] + scores)
                    # Undo the three appends for this combination before
                    # trying the next one.
                    full_parameter.pop(-1)
                    full_parameter.pop(-1)
                    full_parameter.pop(-1)
    print(scores_list)
    save_dict(scores_list, "search_b.txt")
    for i in range(3):  # number of evaluation metrics
        print(sorted(scores_list, key=lambda x: x[i + 1]))
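# Hedged sketch (assumption): `connect_params` is defined elsewhere in the
# repo. Its use in search() suggests it merges a list of parameter dicts into
# a single kwargs dict, with later entries overriding earlier keys, roughly:
def connect_params_sketch(param_dicts):
    merged = {}
    for d in param_dicts:
        merged.update(d)  # later dicts win on key collisions
    return merged

# e.g. connect_params_sketch([{"data_type": 0}, {"n_clusters": 400}])
#      -> {"data_type": 0, "n_clusters": 400}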
def stage2_api(feature_num=500):
    """Fit TF-IDF on the stage-1 API sequences and transform the stage-2 data."""
    api_vec = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9,
                              strip_accents='unicode', use_idf=True,
                              smooth_idf=True, sublinear_tf=True,
                              max_features=feature_num)
    white = pd.read_csv(os.path.join("features", "white.csv"))[['file_name', 'api_name']]
    black = load_df(os.path.join("features", "black"))
    test = load_df(os.path.join("features", "test"))
    stage2 = load_df(os.path.join("features", "stage2"))
    full = pd.concat([white, black, test])
    full_str = to_str(full)
    print("fitting TF-IDF")
    api_vec.fit(full_str)
    print("transforming stage-2 data")
    stage2_str, name_list = to_str(stage2, mode=1)
    save_dict(name_list,
              os.path.join("features", "stage2_name_list" + str(feature_num)))
    stage2_output = api_vec.transform(stage2_str)
    scipy.sparse.save_npz(
        os.path.join("features", "stage2" + str(feature_num) + ".npz"),
        stage2_output)
def stage2_api_new(feature_num=500):
    """Like stage2_api, but fit the TF-IDF vocabulary on the stage-2 data
    itself instead of on the stage-1 white/black/test corpus."""
    api_vec = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9,
                              strip_accents='unicode', use_idf=True,
                              smooth_idf=True, sublinear_tf=True,
                              max_features=feature_num)
    stage2 = load_df(os.path.join("features", "stage2"))
    stage2_str, name_list = to_str(stage2, mode=1)
    print("fitting TF-IDF")
    api_vec.fit(stage2_str)
    print("transforming stage-2 data")
    save_dict(name_list,
              os.path.join(get_root_path(), "features",
                           "stage2_name_list" + str(feature_num)))
    stage2_output = api_vec.transform(stage2_str)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + str(feature_num) + ".npz"),
        stage2_output)
def stage_2_attribute(suffix="_dll", use_less_value=False, type_name="",
                      map_func=None, max_feature=1000):
    """Vectorize a stage-2 attribute column (e.g. loaded-DLL paths)."""
    stage2 = load_df(os.path.join("features", "stage2" + suffix), mode=1)
    if use_less_value:
        # Reduce each value to a coarser form: either via the caller-supplied
        # map_func, or by keeping only the last path component.
        if map_func is None:
            stage2["value"] = stage2["value"].map(lambda x: x.split("\\")[-1])
        else:
            stage2["value"] = stage2["value"].map(map_func)
    stage2_str, name_list = to_str(stage2, mode=1, column_name="value")
    api_vec, _ = train_tf_idf(suffix=suffix, use_less_value=use_less_value,
                              map_func=map_func, data=stage2_str,
                              max_feature=max_feature)
    save_dict(name_list,
              os.path.join(get_root_path(), "features",
                           "stage2_name_list" + suffix + type_name))
    stage2_output = api_vec.transform(stage2_str)
    scipy.sparse.save_npz(
        os.path.join(get_root_path(), "features",
                     "stage2" + suffix + type_name + ".npz"),
        stage2_output)
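# Hedged usage example (illustrative, not from the repo): with
# use_less_value=True and no map_func, each value is reduced to its last path
# component before vectorizing, e.g. "C:\\Windows\\a.dll" -> "a.dll".
# stage_2_attribute(suffix="_dll", use_less_value=True, type_name="_base",
#                   max_feature=1000)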
def api():
    """Fit TF-IDF on the white/black/test API sequences and save the
    transformed matrix and file-name list for each split."""
    api_vec = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9,
                              strip_accents='unicode', use_idf=True,
                              smooth_idf=True, sublinear_tf=True,
                              max_features=500)
    white = pd.read_csv("white.csv")[['file_name', 'api_name']]
    black = load_df("black")
    test = load_df("test")
    full = pd.concat([white, black, test])
    full_str = to_str(full)
    print("fitting TF-IDF")
    api_vec.fit(full_str)
    print("transforming splits")
    # The same transform/save sequence runs for each split.
    for split_name, df in (("black", black), ("white", white), ("test", test)):
        split_str, name_list = to_str(df, mode=1)
        save_dict(name_list, split_name + "_name_list")
        split_output = api_vec.transform(split_str)
        scipy.sparse.save_npz(split_name + ".npz", split_output)
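# Hedged usage sketch: downstream code would reload the matrices api() saves.
# The file names match the save_npz calls above; the loader is standard scipy.
import scipy.sparse

black_matrix = scipy.sparse.load_npz("black.npz")  # sparse, up to 500 columns
white_matrix = scipy.sparse.load_npz("white.npz")
test_matrix = scipy.sparse.load_npz("test.npz")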
def attribution(suffix="_dll", use_less_value=False, type_name="",
                map_func=None, max_feature=2000):
    """Vectorize a stage-1 attribute column for the black/white/test splits."""
    api_vec, data = train_tf_idf(suffix=suffix, use_less_value=use_less_value,
                                 map_func=map_func, max_feature=max_feature)
    white, black, test = data
    # The same transform/save sequence runs for each split.
    for split_name, df in (("black", black), ("white", white), ("test", test)):
        split_str, name_list = to_str(df, mode=1, column_name="value")
        save_dict(name_list,
                  os.path.join(get_root_path(), "features",
                               split_name + "_name_list" + suffix + type_name))
        split_output = api_vec.transform(split_str)
        scipy.sparse.save_npz(
            os.path.join(get_root_path(), "features",
                         split_name + suffix + type_name + ".npz"),
            split_output)
        np.save("api_list_stage2", final_api_list)
        np.save("file_name_list_stage2", file_name_list)
        # np.save("label_", label)
    except Exception as e:
        print("error:", e)
else:
    # Stage-1 path: rebuild the per-file API sequences from the raw rows.
    for file_name, api_df in full.groupby('file_name'):
        # Stable sort by call time so equal timestamps keep their input order.
        api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort")
        # Collapse repeated call patterns to shorten the sequences.
        result = delete_repeat_pattern(api_df['api_name'].values.tolist(), 2)
        result = delete_same_pattern(result, 3)
        final_api_list.append(result)
        label.append(api_df['label'].values[0])
    try:
        save_dict(final_api_list, "./api_list_less.txt")
        save_dict(label, "./label_less.txt")
    except Exception as e:
        print("error:", e)
    # final_api_list = np.load("api_list_stage2.npy")
    # label = np.zeros((len(final_api_list)))

# Pad or truncate every sequence to a fixed length; the pad value is the last
# index of the embedding vocabulary (input_dim - 1).
fixed_sequence = pad_sequences(final_api_list, maxlen=shape[0], dtype='int32',
                               padding='post', truncating='post',
                               value=input_dim - 1)
from basic_function import load_df, save_dict
import pandas as pd

# Build an api_name -> integer id mapping from the stage-2 data, for use when
# encoding API-call sequences.
full = load_df("../features/stage2")
# full.fillna("None", inplace=True)
api_list = set(full['api_name'])
print(len(api_list))
api_dict = dict(zip(api_list, range(len(api_list))))
save_dict(api_dict, "../features/api_dict.txt")
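# Hedged usage sketch: api_dict maps each api_name to an integer id, which is
# what the pad_sequences step above expects. The call names below are made up;
# unknown names fall back to len(api_dict), one past the known ids, matching a
# vocabulary of size len(api_dict) + 1 with the last index used for padding.
# The import path assumes TF2-style Keras.
from tensorflow.keras.preprocessing.sequence import pad_sequences

example_calls = ["LdrLoadDll", "NtCreateFile", "NtWriteFile"]  # hypothetical
encoded = [api_dict.get(name, len(api_dict)) for name in example_calls]
fixed = pad_sequences([encoded], maxlen=1000, dtype='int32',
                      padding='post', truncating='post', value=len(api_dict))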