Пример #1
0
def tianchi_api():
    """Fit a TF-IDF vectorizer on the combined white/black/test API-call
    corpus, then transform the tianchi competition data and save both the
    sparse feature matrix and the file-name list under ``features/``."""
    vectorizer = TfidfVectorizer(ngram_range=(1, 5),
                                 min_df=3,
                                 max_df=0.9,
                                 strip_accents='unicode',
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 max_features=500)

    # Corpora used only for fitting the vocabulary/idf weights.
    white_df = pd.read_csv(os.path.join("features", "white.csv"))
    white_df = white_df[['file_name', 'api_name']]
    black_df = load_df(os.path.join("features", "black"))
    test_df = load_df(os.path.join("features", "test"))
    # The data set that actually gets transformed.
    tianchi_df = pd.read_csv("security_train.csv")
    tianchi_df = tianchi_df.rename(columns={"file_id": "file_name"})

    corpus = to_str(pd.concat([white_df, black_df, test_df]))

    print(1)  # progress marker: start fitting
    vectorizer.fit(corpus)
    print(2)  # progress marker: start transforming

    docs, names = to_str(tianchi_df, mode=1, column_name="api")
    save_dict(names, os.path.join("features", "tianchi_name_list"))
    matrix = vectorizer.transform(docs)
    scipy.sparse.save_npz(os.path.join("features", "tianchi.npz"), matrix)
Пример #2
0
def search():
    """Grid-search clustering hyper-parameters.

    Walks the cross product of top-level parameters, cluster ways and
    per-way parameters, runs ``train_cluster`` for each combination,
    collects the scores, saves them and prints rankings under each of
    the three evaluation metrics."""
    base_parameter = {"data_type": 0, "dimension_reduction": 0}
    top_search = {"n_clusters": [380, 400, 420, 440, 460, 480]}
    cluster_ways = [0]
    cluster_parameter = {0: [{"linkage": 0}]}

    # Parameter dicts are pushed/popped like a stack as we descend the grid.
    stack = []
    scores_list = []
    for key, values in top_search.items():
        for value in values:
            stack.append({key: value})
            for way in cluster_ways:
                stack.append({"cluster_way": way})
                for extra in cluster_parameter[way]:
                    stack.append(extra)
                    params = connect_params([base_parameter] + stack)
                    print("params:", params)
                    scores = train_cluster(**params)
                    scores_list.append([params] + scores)
                    stack.pop()
                stack.pop()
            stack.pop()

    print(scores_list)

    save_dict(scores_list, "search_b.txt")

    # Rank results by each of the three evaluation scores.
    for metric in range(3):  # length of evaluation ways
        print(sorted(scores_list, key=lambda row: row[metric + 1]))
Пример #3
0
def stage2_api(feature_num=500):
    """Fit TF-IDF on the white/black/test API corpus and transform the
    stage-2 data set.

    Args:
        feature_num: max_features for the vectorizer; also suffixes the
            output file names so several sizes can coexist.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 5),
                                 min_df=3,
                                 max_df=0.9,
                                 strip_accents='unicode',
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 max_features=feature_num)

    # Fit corpus (white/black/test) vs. transform target (stage2).
    white_df = pd.read_csv(os.path.join("features", "white.csv"))
    white_df = white_df[['file_name', 'api_name']]
    black_df = load_df(os.path.join("features", "black"))
    test_df = load_df(os.path.join("features", "test"))
    stage2_df = load_df(os.path.join("features", "stage2"))

    corpus = to_str(pd.concat([white_df, black_df, test_df]))

    print(1)  # progress marker: start fitting
    vectorizer.fit(corpus)
    print(2)  # progress marker: start transforming

    docs, names = to_str(stage2_df, mode=1)
    save_dict(names,
              os.path.join("features", "stage2_name_list" + str(feature_num)))
    matrix = vectorizer.transform(docs)
    scipy.sparse.save_npz(
        os.path.join("features", "stage2" + str(feature_num) + ".npz"),
        matrix)
Пример #4
0
def stage2_api_new(feature_num=500):
    """Fit TF-IDF directly on the stage-2 corpus (unlike ``stage2_api``,
    which fits on white/black/test) and save the transformed matrix plus
    the file-name list under the project root's ``features/`` directory.

    Args:
        feature_num: max_features for the vectorizer; also suffixes the
            output file names.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 5),
                                 min_df=3,
                                 max_df=0.9,
                                 strip_accents='unicode',
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 max_features=feature_num)

    stage2_df = load_df(os.path.join("features", "stage2"))
    docs, names = to_str(stage2_df, mode=1)

    print(1)  # progress marker: start fitting
    vectorizer.fit(docs)
    print(2)  # progress marker: start transforming

    feature_dir = os.path.join(get_root_path(), "features")
    save_dict(names,
              os.path.join(feature_dir, "stage2_name_list" + str(feature_num)))
    matrix = vectorizer.transform(docs)
    scipy.sparse.save_npz(
        os.path.join(feature_dir, "stage2" + str(feature_num) + ".npz"),
        matrix)
Пример #5
0
def stage_2_attribute(suffix="_dll",
                      use_less_value=False,
                      type_name="",
                      map_func=None,
                      max_feature=1000):
    """Build TF-IDF attribute features for the stage-2 data set.

    Loads ``features/stage2<suffix>``, optionally shrinks each value,
    obtains a TF-IDF vectorizer via ``train_tf_idf`` and saves the
    transformed sparse matrix and the file-name list.

    Args:
        suffix: data-set suffix (e.g. "_dll"); also part of output names.
        use_less_value: if True, reduce each value before vectorizing.
        type_name: extra tag appended to output file names.
        map_func: optional callable applied to each value when
            ``use_less_value`` is True; defaults to keeping the last
            backslash-separated component (e.g. a Windows path's basename).
        max_feature: max_features forwarded to the vectorizer.
    """
    stage2 = load_df(os.path.join("features", "stage2" + suffix), mode=1)

    if use_less_value:
        if map_func is None:
            # Default reduction: keep only the last path component.
            stage2["value"] = stage2["value"].map(lambda x: x.split("\\")[-1])
        else:
            stage2["value"] = stage2["value"].map(map_func)
    stage2_output, name_list = to_str(stage2, mode=1, column_name="value")
    # BUG FIX: forward the caller's suffix instead of the hard-coded "_dll",
    # so non-dll data sets train the vectorizer on their own data.
    api_vec, _ = train_tf_idf(suffix=suffix,
                              use_less_value=use_less_value,
                              map_func=map_func,
                              data=stage2_output,
                              max_feature=max_feature)

    feature_dir = os.path.join(get_root_path(), "features")
    save_dict(name_list,
              os.path.join(feature_dir,
                           "stage2_name_list" + suffix + type_name))
    stage2_output = api_vec.transform(stage2_output)
    scipy.sparse.save_npz(
        os.path.join(feature_dir, "stage2" + suffix + type_name + ".npz"),
        stage2_output)
Пример #6
0
def api():
    """Fit TF-IDF on the combined white/black/test API-call corpus, then
    transform each of the three data sets and save, per set, the file-name
    list and the sparse feature matrix."""
    vectorizer = TfidfVectorizer(ngram_range=(1, 5),
                                 min_df=3,
                                 max_df=0.9,
                                 strip_accents='unicode',
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 max_features=500)

    white = pd.read_csv("white.csv")[['file_name', 'api_name']]
    black = load_df("black")
    test = load_df("test")

    print(1)  # progress marker: start fitting
    vectorizer.fit(to_str(pd.concat([white, black, test])))
    print(2)  # progress marker: start transforming

    # Same transform-and-save routine for every data set.
    for tag, frame in (("black", black), ("white", white), ("test", test)):
        docs, names = to_str(frame, mode=1)
        save_dict(names, tag + "_name_list")
        scipy.sparse.save_npz(tag + ".npz", vectorizer.transform(docs))
Пример #7
0
def attribution(suffix="_dll",
                use_less_value=False,
                type_name="",
                map_func=None,
                max_feature=2000):
    """Build TF-IDF attribute features for the black/white/test data sets.

    Trains a TF-IDF vectorizer via ``train_tf_idf`` and, for each of the
    three data sets it returns, saves the file-name list and the
    transformed sparse matrix under the project root's ``features/``.

    Args:
        suffix: data-set suffix (e.g. "_dll"); also part of output names.
        use_less_value: if True, values are reduced before vectorizing
            (handled inside ``train_tf_idf``).
        type_name: extra tag appended to output file names.
        map_func: optional value-reduction callable forwarded to
            ``train_tf_idf``.
        max_feature: max_features forwarded to the vectorizer.
    """
    # BUG FIX: forward the caller's suffix instead of the hard-coded "_dll",
    # so non-dll data sets train the vectorizer on their own data.
    api_vec, data = train_tf_idf(suffix=suffix,
                                 use_less_value=use_less_value,
                                 map_func=map_func,
                                 max_feature=max_feature)

    white, black, test = data
    feature_dir = os.path.join(get_root_path(), "features")

    # Identical transform-and-save routine for every data set
    # (kept in the original black/white/test order).
    for tag, frame in (("black", black), ("white", white), ("test", test)):
        docs, name_list = to_str(frame, mode=1, column_name="value")
        save_dict(name_list,
                  os.path.join(feature_dir,
                               tag + "_name_list" + suffix + type_name))
        output = api_vec.transform(docs)
        scipy.sparse.save_npz(
            os.path.join(feature_dir, tag + suffix + type_name + ".npz"),
            output)
Пример #8
0
        # NOTE(review): this chunk begins mid-block — the matching `try:`
        # and the `if` for the `else:` below lie above this excerpt.
        # Persist the accumulated per-file API sequences and file names.
        np.save("api_list_stage2", final_api_list)
        np.save("file_name_list_stage2", file_name_list)
        # np.save("label_", label)
    except:  # NOTE(review): bare except swallows all errors — narrow if possible
        print("error")
else:
    # Build one compressed API-call sequence per file, ordered by call time.
    for file_name, api_df in full.groupby('file_name'):
        # mergesort is a stable sort: rows with equal call_time keep order.
        api_df = api_df.sort_values(by="call_time", axis=0, kind="mergesort")
        # Collapse repeated patterns; presumably length-2 repeats first,
        # then runs of 3 identical patterns — TODO confirm helper semantics.
        result = delete_repeat_pattern(api_df['api_name'].values.tolist(), 2)
        result = delete_same_pattern(result, 3)

        final_api_list.append(result)

        # One label per file; all rows of a file share the same label.
        label.append(api_df['label'].values[0])
    try:
        save_dict(final_api_list, "./api_list_less.txt")
        save_dict(label, "./label_less.txt")
    except:  # NOTE(review): bare except — consider catching OSError only
        print("error")

# final_api_list = np.load("api_list_stage2.npy")
# label = np.zeros((len(final_api_list)))
# print(api_df)
# print(final_api_list)
# Pad/truncate every sequence to a fixed length shape[0]; the padding token
# is the last embedding index (input_dim - 1).
fixed_sequence = pad_sequences(final_api_list,
                               maxlen=shape[0],
                               dtype='int32',
                               padding='post',
                               truncating='post',
                               value=input_dim - 1)
Пример #9
0
from basic_function import load_df, save_dict
import pandas as pd

full = load_df("../features/stage2")

# full.fillna("None", inplace)
# Assign every distinct API name a unique integer id and persist the mapping.
api_list = set(full['api_name'])
print(len(api_list))
api_dict = {api: index for index, api in enumerate(api_list)}
save_dict(api_dict, "../features/api_dict.txt")