Exemplo n.º 1
0
def process_dataset(df, seed):
    """
    Filter, balance, label-encode and cast the dataset in a single pass.

    The original author notes this is done once over the whole dataset
    to save memory.

    Steps, in order:

    1. Drop every row whose ``packer_name`` is ``'dolphin-dropper-3'``.
    2. Balance the packed rows (``packer_name`` other than ``'none'``)
       through ``exp_util.balance_per_packer`` and keep every ``'none'``
       row as-is.
    3. Label-encode via ``exp_util.label_encode``, which also receives
       the module-level ``res_dir``.
    4. Balance the result with ``exp_util.balance_sets(..., mode=0)``.
       NOTE(review): the meaning of ``mode=0`` is defined in
       ``exp_util`` -- confirm there.
    5. Attempt a cast to ``np.float32``; ``errors='ignore'`` suppresses
       a failing cast instead of raising.

    :param df: pandas dataframe exposing a ``packer_name`` column
    :param seed: seed forwarded to the ``exp_util`` balancing helpers
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    """
    df = df[df.packer_name != 'dolphin-dropper-3']
    # NOTE: restricting to util.WILD_SRC sources is disabled in this variant.
    # df = df[df.source.isin(util.WILD_SRC)]

    # Every packer label still present, except the 'none' (unpacked) label.
    packers = [name for name in df.packer_name.unique() if name != 'none']

    packed_rows = df[df.packer_name.isin(packers)]
    balanced_packed = exp_util.balance_per_packer(packed_rows, seed, packers)
    unpacked_rows = df[df.packer_name == 'none']

    # Keep the balanced packed rows plus all of the unpacked ones.
    keep = set(balanced_packed.index) | set(unpacked_rows.index)
    df = df[df.index.isin(keep)]

    print("label encoding of strings features")
    encoded = exp_util.label_encode(df, res_dir)

    balanced = exp_util.balance_sets(encoded, seed, mode=0)

    return balanced.astype(np.float32, errors='ignore')
def process_dataset(df, seed):
    """
    Keep balanced wild rows plus the rows of one packer, then encode and cast.

    The original author notes this is done once over the whole dataset
    to save memory.

    Side effects: rebinds the module-level globals ``dfp`` (the rows of
    ``df`` whose ``packer_name`` equals the module-level ``packer``) and
    ``train_indices`` (the index labels of the balanced wild rows).
    Judging by the names, the wild rows are the training side and the
    ``packer`` rows the test side -- verify against the caller.

    Steps, in order:

    1. Select rows whose ``source`` is in ``util.WILD_SRC`` and balance
       them with ``exp_util.balance_sets(..., mode=1)``.
       NOTE(review): the meaning of ``mode=1`` is defined in
       ``exp_util`` -- confirm there.
    2. Select the rows belonging to ``packer`` (not balanced here).
    3. Restrict ``df`` to the union of both index sets.
    4. Label-encode via ``exp_util.label_encode``, which also receives
       the module-level ``res_dir``.
    5. Attempt a cast to ``np.float32``; ``errors='ignore'`` suppresses
       a failing cast instead of raising.

    :param df: pandas dataframe exposing ``source`` and ``packer_name`` columns
    :param seed: seed forwarded to ``exp_util.balance_sets``
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    """
    global dfp, train_indices

    wild_rows = df[df.source.isin(util.WILD_SRC)]
    wild_balanced = exp_util.balance_sets(wild_rows, seed, mode=1)

    dfp = df[df.packer_name == packer]
    # NOTE: per-packer balancing of dfp is disabled in this variant.
    # dfp = exp_util.balance_per_packer(dfp, seed, [packer])

    train_indices = set(wild_balanced.index)
    selected = train_indices | set(dfp.index)
    df = df[df.index.isin(selected)]

    print("label encoding of strings features")
    encoded = exp_util.label_encode(df, res_dir)

    return encoded.astype(np.float32, errors='ignore')
Exemplo n.º 3
0
def process_dataset(df, seed):
    """
    Label-encode, balance and cast the whole dataset in a single pass.

    The original author notes this is done once over the whole dataset
    to save memory.

    Steps, in order:

    1. Label-encode via ``exp_util.label_encode``, which also receives
       the module-level ``res_dir``.
    2. Balance the result with ``exp_util.balance_sets(..., mode=1)``.
       NOTE(review): the meaning of ``mode=1`` is defined in
       ``exp_util`` -- confirm there.
    3. Attempt a cast to ``np.float32``; ``errors='ignore'`` suppresses
       a failing cast instead of raising.

    :param df: pandas dataframe
    :param seed: seed forwarded to ``exp_util.balance_sets``
    :rtype: pandas.DataFrame
    :return: the processed dataframe
    """
    # NOTE: restricting to util.WILD_SRC sources is disabled in this variant.
    # df = df[df.source.isin(util.WILD_SRC)]

    print("label encoding of strings features")
    encoded = exp_util.label_encode(df, res_dir)

    balanced = exp_util.balance_sets(encoded, seed, mode=1)

    return balanced.astype(np.float32, errors='ignore')