Example #1
def read_data_extract_to_df(args):
    '''
    Function to extract app data from its json file of parsed api calls into a dataframe
    with one row per |<app>|<block>|<api_call>|<package>| combination.

    Parameters
    ----------
    args: tuple, required
        Tuple of three arguments: the path to the app's parsed json file, the naming key
        mapping each <app>, <block>, <api_call>, and <package> string to its node code,
        and a multiprocessing flag

    Returns
    -------
    Dataframe with one row per |<app>|<block>|<api_call>|<package>| combination.
    '''
    fp, naming_key, multiprocessing = args
    fn = os.path.basename(fp).replace(".json", "")

    blocks = np.array(jf.load_json(fp), dtype="object")
    blocks = list(filter(None, blocks))  #remove empty blocks

    args_mapping = {"naming_key": naming_key, "filename": fn}

    if multiprocessing:
        with Pool() as pool:
            parsed = pool.map(partial(get_nodes, **args_mapping), blocks)
    else:
        parsed = []
        for block in blocks:
            parsed.append(get_nodes(block, **args_mapping))
    parsed = merge_dictionaries(parsed)
    return pd.DataFrame(parsed)
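A minimal usage sketch, assuming `jf.load_json` is importable as in the surrounding code; the json path and key file below are hypothetical placeholders, and the naming-key layout mirrors the `jf.load_json(key_fn)["get_key"]` lookup used in Example #5:

#hypothetical paths, for illustration only
naming_key = jf.load_json("keys/data_naming_key.json")["get_key"]
args = ("data/extracted/app_M_0001.json", naming_key, False)  #False = run in serial
df = read_data_extract_to_df(args)
print(df.head())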
Example #2
def EDA(src, verbose=True, lim=None, multiprocessing=False, save_data=True):
    '''
    Function to run EDA.

    Parameters
    ----------
    src: str, required
        Path to the parameter json file to run EDA with
    verbose: logical, optional. Default True
        If true print updates on eda. Do not print updates otherwise
    lim: int, optional. Default None
        If not `None` then limit apps by that amount
    multiprocessing: logical, optional. Default False
        WARNING: DO NOT RUN! Currently not working. It is faster to run in serial than in parallel.
        If true run with multiprocessing. Run in serial otherwise
    save_data: logical, optional. Default True
        If true save <mal_df> and <ben_df> to the configured save directory

    The filenames, naming key path, and destination passed to get_node_data are read
    from the "eda-params" section of the parameter file rather than passed as arguments.

    Returns
    -------
    malware: Dataframe of malicious app node data
    benign: Dataframe of benign app node data
    '''

    params = jf.load_json(src)
    params = update_paths(False, params)

    if verbose:
        print()
        print("Parameters:")
        pprint.pprint(params)

    eda_params = params["eda-params"]

    key_fn = os.path.join(eda_params["dict_directory"],
                          eda_params["data_naming_key_filename"])

    malware, benign = get_node_data(
        verbose=verbose,
        lim=lim,
        multiprocessing=multiprocessing,
        save_data=save_data,
        mal_filename=eda_params["malware_node_filename"],
        ben_filename=eda_params["benign_node_filename"],
        key_fn=key_fn,
        src=eda_params["data_extract_loc"],
        dst=eda_params["save_dir"])
    return malware, benign
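A hypothetical invocation, assuming `config/params.json` (the path used in the __main__ block of Example #6) has the layout the function expects:

malware_df, benign_df = EDA("config/params.json", verbose=True, lim=10,
                            multiprocessing=False, save_data=False)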
Example #3
def create_w2v_embedding(path, path_to_unique_apis, **params):
    print("--- W2V Embedding ---")
    s = time.time()

    corp_size = params["size"]
    window_size = params["window"]
    work_size = params["workers"]
    #     path_to_unique_apis=os.path.join(params[""])
    api_list = json_functions.load_json(
        path_to_unique_apis)["get_key"]["calls"].keys()
    unique_apis = dict(zip(api_list,
                           range(1, len(api_list) + 1)))  #map each api string to a 1-based integer id
    corpus = []

    #exhaust os.walk so that `lister` holds the file list of the last directory visited
    for root, dirs, lister in os.walk(path):
        continue

    for i in lister:
        if "checkpoint" in i:
            continue
        temp = json_functions.load_json(path + i)
        temp = [item.split(" ")[-1] for sublist in temp for item in sublist]
        corpus.append(temp)
    app_ids = list(range(0, len(corpus)))
    abst = []
    content = []
    for app in corpus:
        try:
            abstracted = []
            for api in app:
                abstracted.append(unique_apis[api.split(" ")[-1]])
            abst.append(abstracted)
        except KeyError:  #skip apps containing an api not in the lookup table
            continue
    content.append(abst)
    content.append(app_ids)
    fp = os.path.join(params["save_dir"], params["content_filename"])
    if params["verbose"]:
        print("Saved %s to %s" %
              (params["content_filename"], params["save_dir"]))


#     pickle.dump(content, open(fp, "wb"))

    print("Corpus construction done in " + str(time.time() - s) +
          " seconds with " + str(len(corpus)) + " documents")
    s = time.time()

    #Model
    print("corpus", np.array(content).shape)
    print("work_size", work_size)
    print("window_size", window_size)
    #     return corpus
    model = Word2Vec(corpus,
                     min_count=1,
                     size=corp_size,
                     workers=work_size,
                     window=window_size,
                     sg=1)
    save_w2v_embedding(model, corp_size, unique_apis, **params)
    print("Word2Vec done in " + str(time.time() - s) + " seconds")
    return model
Example #4
def fast_dict(**kwargs):
    """Builds dictionaries which can be converted into matrices A,B,P,I, along with corrisponding test matrices

    :return; four dictionaries corresponding to matrices A,B,P,I and a test matrix A_test
    """
    key_directory = kwargs["dict_directory"]
    verbose = kwargs["verbose"]
    direc = kwargs["out_path"]
    truncate = kwargs["truncate"]
    lower_bound_api_count = kwargs["lower_bound_api_count"]
    naming_key_filename = kwargs["data_naming_key_filename"]
    api_call_filename = kwargs["api_call_filename"]

    key_dst = os.path.join(key_directory, naming_key_filename)
    call_dst = os.path.join(key_directory, api_call_filename)

    def add_key(store, value, prefix, suffix, value_type):
        """
        Takes a value and a dictionary to add the value to. Adds a key, value pair to the
        dictionary if it does not already exist, and returns the key associated with the value.
        Keys are created by concatenating the node's prefix letter (a, b, c, p, or i) with a
        running index.
        """
        if value not in store["get_key"][value_type]:
            key = prefix + str(suffix)
            store["lookup_key"][key] = value
            store["get_key"][value_type][value] = key
            suffix += 1
        else:
            key = store["get_key"][value_type][value]
        return key, suffix

    def append_value(store, key, value):
        """
        Appends value to dictionary at index key. Returns dictionary with appended value
        """
        if key in store:
            store[key].append(value)
        else:
            store[key] = [value]

    #########################
    #FOR TRAIN PORTION OF SPLIT
    #########################
    B = {}
    A = {}
    P = {}
    I = {}
    C = {}

    # c- prefix denotes api call
    # a- prefix denotes app
    # b- prefix denotes code block
    # p- prefix denotes package
    # i- prefix denotes invoke type
    key_lookup = {
        "get_key": {
            "apps":
            {},  #input a value and value type, i.e. "apps", etc, and get the associated key
            "blocks": {},
            "packages": {},
            "invokes": {},
            "calls": {}
        },
        "lookup_key": {}  #input a key and get the associated value
    }
    list_of_files = []
    for root, dirs, files in os.walk(direc):
        list_of_files.append(files)

    list_of_files = list(
        set([item for sublist in list_of_files for item in sublist]))
    random.shuffle(list_of_files)
    print(str(len(list_of_files)) + " Total Files for Dictionary Creation")

    ax = 0  #index for apps
    bx = 0  #index for blocks
    px = 0  #index for packages
    ix = 0  #index for invoke types
    cx = 0  #index for calls
    iix = 0  #keep track of iterations
    start_time = time.time()
    for file in tqdm(list_of_files):
        if "checkpoint" in file:  #for stupid git ignores
            continue
        fn = direc + file
        filez = jf.load_json(fn)

        filename = file.rstrip(".json")
        akey, ax = add_key(key_lookup, filename, "a", ax, "apps")

        for block in filez:

            if len(block) > 0:  #skip empty blocks
                full_block = " ".join(block)
                #add block to lookup table and get a key
                bkey, bx = add_key(key_lookup, full_block, "b", bx, "blocks")

                for call in block:
                    try:
                        api_call = call.split("}, ")[1].split(" ")[0].strip()
                        ckey, cx = add_key(key_lookup, api_call, "c", cx,
                                           "calls")
                        append_value(A, akey, ckey)  #append key to dictionary
                        append_value(B, bkey, ckey)

                        package = call.split(";")[0].split(",")[-1].strip()
                        pkey, px = add_key(key_lookup, package, "p", px,
                                           "packages")
                        append_value(P, pkey, ckey)

                        invoke_type = call.split("}, ")[0].split(
                            " ")[0].strip()
                        ikey, ix = add_key(key_lookup, invoke_type, "i", ix,
                                           "invokes")
                        append_value(I, ikey, ckey)

                        if ckey in C:
                            C[ckey] = C[ckey] + 1
                        else:
                            C[ckey] = 1
                        iix += 1
                    except IndexError:  #skip malformed call strings
                        continue
    if truncate:
        #remove API calls occurring no more than lower_bound_api_count times
        #across the whole data set, from both keys and value lists
        rare = set(k for k, v in C.items() if v <= lower_bound_api_count)
        if verbose:
            print()
            print(
                "Truncation is set to True; API calls occurring no more than lower_bound_api_count times will be removed from the model."
            )
            print("Number of unique API calls before truncation: " + str(len(C)))
        for d in [B, P, I, A]:
            for k in list(d.keys()):
                d[k] = [c for c in d[k] if c not in rare]
                if not d[k]:  #drop entries emptied by truncation
                    del d[k]
        if verbose:
            print("Number of unique API calls after truncation:  " +
                  str(len(C) - len(rare)))
            print()
    #save the key_lookup table to "key_directory" config parameter in dict_build.json
    jf.save_json(key_lookup, key_dst)
    jf.save_json(C, call_dst)
    if verbose:
        print("Saving node key lookup table to: %s" % key_dst)
        print("Saving api call list to: %s" % call_dst)
    return B, P, I, A
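To make the string splits above concrete, here is a worked example on a smali-style invoke line; the sample line is illustrative, not drawn from the data set:

call = "invoke-virtual {v0, v1}, Landroid/telephony/SmsManager;->sendTextMessage(...)V"
invoke_type = call.split("}, ")[0].split(" ")[0].strip()  #'invoke-virtual'
api_call = call.split("}, ")[1].split(" ")[0].strip()     #the full ;-> method reference
package = call.split(";")[0].split(",")[-1].strip()       #'Landroid/telephony/SmsManager'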
Example #5
def get_node_data(display_data=True, **kwargs):
    '''
    Function to get dataframes of malicious and benign apps. 
    Dataframes are rowwise, with columns for each |<app>|<block>|<api_call>|<package>|
    combination.

    Parameters
    ----------
    verbose: logical, required
        If true print updates on eda. Do not print updates otherwise
    lim: int, required
        If not `None` then limit apps by that amount
    multiprocessing: logical, required
        If true run with multiprocessing. Run in serial otherwise
    save_data: logical, required
        If true save <mal_df> and <ben_df> to <dst>
    mal_filename: str, required
        filename to save malware node data to
    ben_filename: str, required
        filename to save benign node data to
    key_fn: str, required
        Path to json file lookup table of node codes for the respective <app>, <block>,
        <api_call>, and <package> strings
    src: str, required
        Path of parsed smali code in json files
    dst: str, required
        Path to save <mal_df> and <ben_df> as csv files
    Returns
    -------
    Two dataframes.
    mal_df: Dataframe of malicious app-block-api call-package combinations
    ben_df: Dataframe of benign app-block-api call-package combinations
    '''
    verbose = kwargs["verbose"]
    lim = kwargs["lim"]
    multiprocessing = kwargs["multiprocessing"]
    save_data = kwargs["save_data"]
    mal_filename = kwargs["mal_filename"]
    ben_filename = kwargs["ben_filename"]
    key_fn = kwargs["key_fn"]
    src = kwargs["src"]
    dst = kwargs["dst"]

    # print("test11")
    if verbose:
        print("Retrieving naming key from `%s`" % key_fn)
        start = time.time()
        key = jf.load_json(key_fn)["get_key"]
        print("Retrieved naming key in %i seconds" % (time.time() - start))
    else:
        key = jf.load_json(key_fn)["get_key"]
    # print("key",key)
    # mal_apps, ben_apps=get_apps(src, lim)
    #a slice with lim=None returns the full list, so both cases collapse to one
    mal_apps = [(os.path.join(src, file), key, multiprocessing)
                for file in os.listdir(src) if "_M_" in file][:lim]
    ben_apps = [(os.path.join(src, file), key, multiprocessing)
                for file in os.listdir(src) if "_B_" in file][:lim]
    mal_data = []
    ben_data = []

    if verbose:
        print(
            "Found %i malicious apps and %i benign apps to extract data from" %
            (len(mal_apps), len(ben_apps)))
        start = time.time()
        for app in tqdm(mal_apps):
            mal_data.append(read_data_extract_to_df(app))
        print("Retrieved data on %i malicious apps in %i seconds" %
              (len(mal_data), time.time() - start))

        start = time.time()
        for app in tqdm(ben_apps):
            ben_data.append(read_data_extract_to_df(app))
        print("Retrieved data on %i benign apps in %i seconds" %
              (len(mal_data), time.time() - start))

    else:
        for app in tqdm(mal_apps):
            mal_data.append(read_data_extract_to_df(app))
        for app in tqdm(ben_apps):
            ben_data.append(read_data_extract_to_df(app))

    mal_df = pd.concat(mal_data)
    ben_df = pd.concat(ben_data)

    if save_data:
        mal_fn = os.path.join(dst, mal_filename)
        ben_fn = os.path.join(dst, ben_filename)
        if verbose:
            print("Saving malware app data to %s" % mal_fn)
            print("Saving benign app data to %s" % ben_fn)
        #write the csv files the docstring promises
        mal_df.to_csv(mal_fn, index=False)
        ben_df.to_csv(ben_fn, index=False)

    if display_data:
        print("Malware data:")
        display(mal_df.head())
        print("\nBenign data:")
        display(ben_df.head())
    return mal_df, ben_df
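Since `get_node_data` reads every kwarg unconditionally, all nine must be supplied. A hypothetical call with placeholder paths and filenames:

mal_df, ben_df = get_node_data(
    verbose=True, lim=5, multiprocessing=False, save_data=False,
    mal_filename="mal_nodes.csv", ben_filename="ben_nodes.csv",
    key_fn="keys/data_naming_key.json",
    src="data/extracted/", dst="out/")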
Example #6
def EDA(src, verbose=True, lim=None, multiprocessing=False, save_data=True):
    '''Function to run EDA (see Example #2 for the full docstring).'''
    params = jf.load_json(src)
    params = update_paths(False, params)

    if verbose:
        print()
        print("Parameters:")
        pprint.pprint(params)

    eda_params = params["eda-params"]

    key_fn = os.path.join(eda_params["dict_directory"],
                          eda_params["data_naming_key_filename"])

    malware, benign = get_node_data(
        verbose=verbose,
        lim=lim,
        multiprocessing=multiprocessing,
        save_data=save_data,
        mal_filename=eda_params["malware_node_filename"],
        ben_filename=eda_params["benign_node_filename"],
        key_fn=key_fn,
        src=eda_params["data_extract_loc"],
        dst=eda_params["save_dir"])
    return malware, benign


if __name__ == "__main__":
    param_fp = os.path.join('config', 'params.json')
    params = jf.load_json(param_fp)
    eda_params = params['eda-params']

    src = eda_params['data_extract_loc']
    key_fn = eda_params["data_naming_key"]

    #get_node_data reads every kwarg unconditionally (see Example #5), so all
    #nine must be supplied
    malware, benign = get_node_data(
        verbose=True,
        lim=None,
        multiprocessing=False,
        save_data=False,
        mal_filename=eda_params["malware_node_filename"],
        ben_filename=eda_params["benign_node_filename"],
        key_fn=key_fn,
        src=src,
        dst=eda_params["save_dir"])
Example #7
def run(cmd_line_args, params):
    '''
    Function to apply command line arguments and run the pipeline.

    Parameters
    ----------
    cmd_line_args: listOfStrings, required
        list of command line arguments passed
    params: dictionary, required
        parameter configuration dictionary pulled from `config/params.json`
    Returns
    -------
    None
    '''
    print()
    #get and apply command line arguments
    args_params = params["options"]
    cmd_line_args_dict = get_command_ling_args(cmd_line_args, args_params)
    params["eda-params"]["args_literal"] = cmd_line_args
    params = apply_command_line_args(cmd_line_args_dict, params)

    out_fn = os.path.join(params["out_path"], params["params_name"])
    jf.save_json(params, out_fn)

    kwargs = {"cmd_line_args": cmd_line_args_dict, "params": params}

    Main.run_all(kwargs)
    print()
    sys.exit()


if __name__ == "__main__":

    args = sys.argv[1:]
    data_params = jf.load_json("config/params.json")
    run(args, data_params)