Пример #1
0
def process_dataset(df, seed):
    """
    Prepare the whole dataset in one pass (done a single time to save memory).

    The frame is label-encoded first.  What follows depends on the
    module-level ``model_name``:

    * ``'nn'``: keep only the columns named in ``exp_util.DROP_COLUMNS`` plus
      ``generic_fileSize`` (restricted to those present in the frame), apply
      ``exp_util.remove_large_samples``, balance per packer with
      ``minsize=4000``, apply ``exp_util.import_bytes`` and finally drop
      ``generic_fileSize`` again.
    * any other value: balance per packer with the helper's default size and
      cast to ``float32``; ``errors='ignore'`` is passed to ``astype``.

    Relies on the module-level names ``exp_util``, ``res_dir``,
    ``model_name``, ``packers`` and ``np``.

    :param df: pandas dataframe holding the dataset
    :param seed: seed forwarded to ``exp_util.balance_per_packer``
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    """
    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)

    # ``df`` is deliberately rebound at every step so that intermediate
    # frames can be released as soon as they are superseded.
    if model_name != 'nn':
        df = exp_util.balance_per_packer(df, seed, packers)
        return df.astype(np.float32, errors='ignore')

    # 'nn' path.  generic_fileSize is kept only temporarily and dropped at
    # the end -- presumably remove_large_samples needs it; TODO confirm.
    wanted = exp_util.DROP_COLUMNS + ['generic_fileSize']
    df = df[[name for name in wanted if name in df.columns]]
    df = exp_util.remove_large_samples(df)
    df = exp_util.balance_per_packer(df, seed, packers, minsize=4000)
    df = exp_util.import_bytes(df)
    return df.drop(['generic_fileSize'], axis=1)
Пример #2
0
def process_dataset(df, seed):
    """
    Prepare the whole dataset in one pass (done a single time to save memory).

    Steps, in order:

    1. Drop every row whose ``packer_name`` is ``'dolphin-dropper-3'``.
    2. Pass the rows of all remaining packers other than ``'none'`` through
       ``exp_util.balance_per_packer`` and keep only the rows it returns,
       together with *all* rows whose ``packer_name`` is ``'none'``.
    3. Label-encode the string features.
    4. Apply ``exp_util.balance_sets`` with ``mode=0``.
    5. Cast to ``float32``; ``errors='ignore'`` is passed to ``astype``.

    Relies on the module-level names ``exp_util``, ``res_dir`` and ``np``.

    :param df: pandas dataframe with at least a ``packer_name`` column
    :param seed: seed forwarded to the balancing helpers
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    """
    df = df[df.packer_name != 'dolphin-dropper-3']

    # Local list of real packers; 'none' marks rows handled separately below.
    packers = [name for name in df.packer_name.unique() if name != 'none']

    # Collect the index labels to keep: balanced packed rows + every 'none' row.
    indices = set(
        exp_util.balance_per_packer(
            df[df.packer_name.isin(packers)], seed, packers
        ).index
    )
    indices |= set(df[df.packer_name == 'none'].index)
    df = df[df.index.isin(indices)]

    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)

    df = exp_util.balance_sets(df, seed, mode=0)

    return df.astype(np.float32, errors='ignore')
Пример #3
0
def process_dataset(df, seed):
    """
    Prepare the whole dataset in one pass (done a single time to save memory).

    Only rows whose ``source`` equals the module-level ``src`` are used; they
    are first passed through ``exp_util.balance_per_packer``.  The packers are
    then split into the module-level ``good_packers`` and all the others, and
    two class-balanced index sets are built:

    * training set: benign samples of the good packers plus malicious samples
      of the other packers, down-sampled to equal size;
    * test set: malicious samples of the good packers plus benign samples of
      the other packers, down-sampled to equal size.

    Side effects: the module-level globals ``train_indices`` and
    ``test_indices`` are overwritten with those two sets of index labels.

    The returned frame contains exactly the rows in either set, label-encoded
    and cast to ``float32`` (``errors='ignore'`` is passed to ``astype``).

    Relies on the module-level names ``exp_util``, ``res_dir``, ``src``,
    ``good_packers`` and ``np``.

    :param df: pandas dataframe with ``source``, ``packer_name``, ``benign``
        and ``malicious`` columns (the last two are used as boolean masks)
    :param seed: seed used for balancing and as ``random_state`` for sampling
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    :raises AssertionError: if a packer in ``good_packers`` does not occur in
        the balanced data
    """
    import gc

    global train_indices, test_indices

    df = df[df.source == src]
    df = exp_util.balance_per_packer(df, seed)

    # Every good packer must actually be present after balancing.
    all_packers = df.packer_name.unique()
    for gp in good_packers:
        assert gp in all_packers

    # One membership mask serves both halves of the good/other split.
    is_good = df.packer_name.isin(good_packers)
    good_df = df[is_good]
    bad_df = df[~is_good]
    del is_good

    good_df_b = good_df[good_df.benign]
    good_df_m = good_df[good_df.malicious]
    bad_df_b = bad_df[bad_df.benign]
    bad_df_m = bad_df[bad_df.malicious]

    # The four class subsets are all that is needed from here on.
    del good_df, bad_df
    gc.collect()

    # Training set = good benign + other malicious, balanced by sampling both
    # sides down to the size of the smaller one.
    n = min(len(good_df_b), len(bad_df_m))
    good_df_b = good_df_b.sample(n, random_state=seed)
    bad_df_m = bad_df_m.sample(n, random_state=seed)
    train_indices = set(good_df_b.index) | set(bad_df_m.index)
    del good_df_b, bad_df_m
    gc.collect()

    # Test set = good malicious + other benign, balanced the same way.
    n = min(len(good_df_m), len(bad_df_b))
    good_df_m = good_df_m.sample(n, random_state=seed)
    bad_df_b = bad_df_b.sample(n, random_state=seed)
    test_indices = set(good_df_m.index) | set(bad_df_b.index)
    del good_df_m, bad_df_b
    gc.collect()

    # Keep only the rows that belong to either split.
    indices = train_indices | test_indices
    df = df[df.index.isin(indices)]

    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)

    df = df.astype(np.float32, errors='ignore')
    print("done with converting to float")
    return df
Пример #4
0
def process_dataset(df, seed):
    """
    Prepare the whole dataset in one pass (done a single time to save memory).

    Training rows: rows whose ``source`` equals the module-level ``src`` and
    whose index label is either a ``'wild-ember'`` row or a row whose
    ``unpacked_sample_id`` refers to a ``'wild-ember'`` row, passed through
    ``exp_util.balance_per_packer`` with ``minsize=5000``.

    Test rows: packed rows with ``source == 'wild'``, with benign and
    malicious samples down-sampled to equal size.

    Side effects: the module-level globals ``test_indices`` and
    ``train_indices`` are overwritten with the index labels of the two sets.

    The returned frame contains exactly the rows in either set, label-encoded.
    No float conversion is applied here (it is disabled in this variant).

    Relies on the module-level names ``exp_util``, ``res_dir`` and ``src``.

    :param df: pandas dataframe with ``source``, ``unpacked_sample_id``,
        ``packed``, ``benign`` and ``malicious`` columns (the last three are
        used as boolean masks)
    :param seed: seed used for balancing and as ``random_state`` for sampling
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    :raises AssertionError: if fewer than 1000 samples per class remain in the
        packed wild data
    """
    import gc

    global test_indices, train_indices

    # Index labels of the ember samples plus every row derived from them.
    ember_indices = set(df[df.source == 'wild-ember'].index)
    ember_indices |= set(df[df.unpacked_sample_id.isin(ember_indices)].index)

    dfp = df[df.source == src]
    dfp = dfp[dfp.index.isin(ember_indices)]
    # To limit memory usage, 5000 benign and 5000 malicious samples are picked
    # per packer (instead of roughly 6600).
    dfp = exp_util.balance_per_packer(dfp, seed, minsize=5000)

    wild = df[df.source == 'wild']
    wild = wild[wild.packed]
    wildb = wild[wild.benign]
    wildm = wild[wild.malicious]

    # Balance the test set by sampling both classes down to the smaller one.
    per_class = min(len(wildb), len(wildm))
    assert per_class >= 1000
    wildb = wildb.sample(per_class, random_state=seed)
    wildm = wildm.sample(per_class, random_state=seed)

    test_indices = set(wildb.index) | set(wildm.index)
    del wildm, wildb

    train_indices = set(dfp.index)
    indices = train_indices | test_indices

    # Release the intermediate frames before filtering the full frame.
    del dfp
    del wild
    gc.collect()

    df = df[df.index.isin(indices)]

    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)

    gc.collect()
    return df
Пример #5
0
def process_dataset(df, seed):
    """
    Prepare the whole dataset in one pass (done a single time to save memory).

    The frame is label-encoded, passed through
    ``exp_util.balance_per_packer`` restricted to the single module-level
    ``packer``, and cast to ``float32`` (``errors='ignore'`` is passed to
    ``astype``).

    Relies on the module-level names ``exp_util``, ``res_dir``, ``packer``
    and ``np``.

    :param df: pandas dataframe holding the dataset
    :param seed: seed forwarded to ``exp_util.balance_per_packer``
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    """
    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)

    # The helper takes a list of packers; this experiment uses exactly one.
    selected_packers = [packer]
    df = exp_util.balance_per_packer(df, seed, selected_packers)

    return df.astype(np.float32, errors='ignore')
Пример #6
0
def process_dataset(df, seed):
    """
    Prepare the whole dataset in one pass (done a single time to save memory).

    Only rows whose ``packer_name`` is in the module-level ``packers`` are
    used; they are passed through ``exp_util.balance_per_packer``.  The rows
    are then split by packer into a training and a test set using the
    module-level ``train_packer`` and ``test_packer`` collections.

    Side effects: the module-level globals ``train_indices`` and
    ``test_indices`` are overwritten with the index labels of the two sets.

    The returned frame contains exactly the rows in either set, label-encoded
    and cast to ``float32`` (``errors='ignore'`` and ``copy=False`` are passed
    to ``astype``).

    Relies on the module-level names ``exp_util``, ``res_dir``, ``packers``,
    ``train_packer`` and ``test_packer``.

    :param df: pandas dataframe with at least a ``packer_name`` column
    :param seed: seed forwarded to ``exp_util.balance_per_packer``
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    :raises AssertionError: if a packer in ``packers`` does not occur in ``df``
    """
    import numpy as np

    global test_indices, train_indices

    # Compute the distinct packer names once instead of once per packer.
    present_packers = df.packer_name.unique()
    for p in packers:
        assert p in present_packers
    df = df[df.packer_name.isin(packers)]

    df = exp_util.balance_per_packer(df, seed)

    train_indices = set(df[df.packer_name.isin(train_packer)].index)
    test_indices = set(df[df.packer_name.isin(test_packer)].index)
    indices = train_indices.union(test_indices)

    # Rows of packers that are in neither split are discarded here.
    df = df[df.index.isin(indices)]

    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)

    print("converting to float")
    df = df.astype(np.float32, errors='ignore', copy=False)
    print("done with converting")

    return df
Пример #7
0
def process_dataset(df, seed):
    """
    Prepare the whole dataset in one pass (done a single time to save memory).

    Two groups of rows are kept:

    * rows whose ``source`` is in ``util.WILD_SRC``, with benign and malicious
      samples down-sampled to equal size;
    * rows with ``source == 'lab-v3'``, passed through
      ``exp_util.balance_per_packer``.

    The selected rows are then label-encoded.  No float conversion is applied
    here (it is disabled in this variant).

    Relies on the module-level names ``exp_util`` and ``util``.

    :param df: pandas dataframe with ``source``, ``benign`` and ``malicious``
        columns (the last two are used as boolean masks)
    :param seed: seed used for balancing and as ``random_state`` for sampling
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    """
    import gc

    wild = df[df.source.isin(util.WILD_SRC)]
    wildb = wild[wild.benign]
    wildm = wild[wild.malicious]

    # Balance the wild data by sampling both classes down to the smaller one.
    per_class = min(len(wildb), len(wildm))
    wildb = wildb.sample(per_class, random_state=seed)
    wildm = wildm.sample(per_class, random_state=seed)

    indices = list(wildb.index)
    indices += list(wildm.index)

    # Release the intermediate frames before touching the lab data.
    del wild, wildb, wildm
    gc.collect()

    dfp = exp_util.balance_per_packer(df[df.source == 'lab-v3'], seed)
    indices.extend(dfp.index)
    del dfp
    gc.collect()

    df = df[df.index.isin(indices)]

    print("label encoding of strings features")
    # NOTE(review): label_encode is called without a results directory here,
    # unlike the other variants of this function -- confirm this is intended.
    df = exp_util.label_encode(df)

    return df