def read_data_extract_to_df(args):
    '''
    Function to extract app data from the respective json file of parsed api
    calls into a dataframe of each |<app>|<block>|<api_call>|<package>| row
    combination.

    Parameters
    ----------
    args: tuple, required
        Tuple of three arguments: the filepath of the json file, a naming key
        of the names of each <app>, <block>, <api_call>, and <package>, and a
        flag for multiprocessing

    Returns
    -------
    Dataframe of each |<app>|<block>|<api_call>|<package>| row combination.
    '''
    fp = args[0]
    naming_key = args[1]
    multiprocessing = args[2]
    fn = os.path.basename(fp).replace(".json", "")
    blocks = np.array(jf.load_json(fp), dtype="object")
    blocks = list(filter(None, blocks))  # remove empty blocks
    args_mapping = {"naming_key": naming_key, "filename": fn}
    if multiprocessing:
        with Pool() as pool:
            parsed = pool.map(partial(get_nodes, **args_mapping), blocks)
    else:
        parsed = [get_nodes(block, **args_mapping) for block in blocks]
    parsed = merge_dictionaries(parsed)
    return pd.DataFrame(parsed)
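
# Example usage (a minimal sketch; the json path below is a hypothetical
# placeholder, and the naming key is assumed to have been built already by
# fast_dict and saved via jf.save_json):
#
#   key = jf.load_json("config/dict/data_naming_key.json")["get_key"]
#   df = read_data_extract_to_df(("data/extracted/app_M_example.json",
#                                 key, False))
#   print(df.head())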
def EDA(src, verbose=True, lim=None, multiprocessing=False, save_data=True):
    '''
    Function to run EDA.

    Parameters
    ----------
    src: str, required
        Path of the json config file whose "eda-params" section supplies the
        data location, save directory, naming-key path, and the filenames for
        the malware and benign node data
    verbose: logical, optional. Default True
        If true print updates on eda. Do not print updates otherwise
    lim: int, optional. Default None
        If not `None` then limit apps by that amount
    multiprocessing: logical, optional. Default False
        WARNING: DO NOT RUN! Currently not working. It is faster to run in
        serial than in parallel. If true run with multiprocessing. Run in
        serial otherwise
    save_data: logical, optional. Default True
        If true save <mal_df> and <ben_df> to the configured save directory

    Returns
    -------
    Two dataframes: malicious node data and benign node data
    '''
    params = jf.load_json(src)
    params = update_paths(False, params)
    if verbose:
        print()
        print("Parameters:")
        pprint.pprint(params)
    eda_params = params["eda-params"]
    key_fn = os.path.join(eda_params["dict_directory"],
                          eda_params["data_naming_key_filename"])
    malware, benign = get_node_data(
        verbose=verbose,
        lim=lim,
        multiprocessing=multiprocessing,
        save_data=save_data,
        mal_filename=eda_params["malware_node_filename"],
        ben_filename=eda_params["benign_node_filename"],
        key_fn=key_fn,
        src=eda_params["data_extract_loc"],
        dst=eda_params["save_dir"])
    return malware, benign
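
# Example usage (a minimal sketch; `config/params.json` is assumed to be the
# config file consumed by EDA, as in the `__main__` block below):
#
#   mal_df, ben_df = EDA("config/params.json", verbose=True, lim=10)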
def create_w2v_embedding(path, path_to_unique_apis, **params):
    print("--- W2V Embedding ---")
    s = time.time()
    corp_size = params["size"]
    window_size = params["window"]
    work_size = params["workers"]

    # build a lookup of api call -> integer id, starting at 1
    api_list = json_functions.load_json(
        path_to_unique_apis)["get_key"]["calls"].keys()
    unique_apis = dict(zip(api_list, list(range(1, len(api_list) + 1))))

    corpus = []
    # grab the file listing of `path` (only the top level is used)
    _, _, files = next(os.walk(path))
    for i in files:
        if "checkpoint" in i:
            continue
        temp = json_functions.load_json(os.path.join(path, i))
        temp = [item.split(" ")[-1] for sublist in temp for item in sublist]
        corpus.append(temp)
    app_ids = list(range(0, len(corpus)))

    # abstract each app into a list of integer api ids, skipping apps with
    # api calls missing from the lookup table
    abst = []
    for app in corpus:
        try:
            abstracted = [unique_apis[api.split(" ")[-1]] for api in app]
            abst.append(abstracted)
        except KeyError:
            continue
    content = [abst, app_ids]
    fp = os.path.join(params["save_dir"], params["content_filename"])
    if params["verbose"]:
        print("Saved %s to %s" %
              (params["content_filename"], params["save_dir"]))
    with open(fp, "wb") as f:
        pickle.dump(content, f)
    print("Corpus construction done in " + str(time.time() - s) +
          " seconds with " + str(len(corpus)) + " documents")

    s = time.time()
    # Model
    print("corpus", np.array(content, dtype="object").shape)
    print("work_size", work_size)
    print("window_size", window_size)
    # note: gensim < 4.0 names this parameter `size`; gensim 4.x renamed it
    # to `vector_size`
    model = Word2Vec(corpus,
                     min_count=1,
                     size=corp_size,
                     workers=work_size,
                     window=window_size,
                     sg=1)
    save_w2v_embedding(model, corp_size, unique_apis, **params)
    print("Word2Vec done in " + str(time.time() - s) + " seconds")
    return model
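
# Example usage (a minimal sketch; the paths and parameter values below are
# hypothetical placeholders, not values from the project config):
#
#   w2v_params = {"size": 64, "window": 5, "workers": 4,
#                 "save_dir": "data/out", "content_filename": "content.pkl",
#                 "verbose": True}
#   model = create_w2v_embedding("data/apps/",
#                                "config/dict/data_naming_key.json",
#                                **w2v_params)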
def fast_dict(**kwargs):
    """Builds dictionaries which can be converted into matrices A, B, P, I.

    :return: four dictionaries corresponding to matrices B, P, I, A
    """
    key_directory = kwargs["dict_directory"]
    verbose = kwargs["verbose"]
    direc = kwargs["out_path"]
    truncate = kwargs["truncate"]
    lower_bound_api_count = kwargs["lower_bound_api_count"]
    naming_key_filename = kwargs["data_naming_key_filename"]
    api_call_filename = kwargs["api_call_filename"]
    key_dst = os.path.join(key_directory, naming_key_filename)
    call_dst = os.path.join(key_directory, api_call_filename)

    def add_key(store, value, prefix, suffix, value_type):
        """
        Takes a value and a dictionary to add the value to. Adds a key, value
        pair to the dictionary if it does not already exist. Returns the key
        associated with a value along with the (possibly incremented) index.
        A key is created by concatenating the letter of the associated node
        type (a, b, c, p, or i) with the current index for that type.
        """
        if value not in store["get_key"][value_type]:
            key = prefix + str(suffix)
            store["lookup_key"][key] = value
            store["get_key"][value_type][value] = key
            suffix += 1
        else:
            key = store["get_key"][value_type][value]
        return key, suffix

    def append_value(store, key, value):
        """
        Appends value to the list stored at index key, creating the list if
        the key is not yet present. Mutates `store` in place.
        """
        if key in store:
            store[key].append(value)
        else:
            store[key] = [value]

    #########################
    # FOR TRAIN PORTION OF SPLIT
    #########################
    B = {}
    A = {}
    P = {}
    I = {}
    C = {}
    # c- prefix denotes api call
    # a- prefix denotes app
    # b- prefix denotes code block
    # p- prefix denotes package
    # i- prefix denotes invoke type
    key_lookup = {
        "get_key": {
            # input a value and value type, i.e. "apps", etc., and get the
            # associated key
            "apps": {},
            "blocks": {},
            "packages": {},
            "invokes": {},
            "calls": {}
        },
        "lookup_key": {}  # input a key and get the associated value
    }
    list_of_files = []
    for root, dirs, files in os.walk(direc):
        list_of_files.append(files)
    list_of_files = list(
        set([item for sublist in list_of_files for item in sublist]))
    random.shuffle(list_of_files)
    print(str(len(list_of_files)) + " Total Files for Dictionary Creation")

    ax = 0  # index for apps
    bx = 0  # index for blocks
    px = 0  # index for packages
    ix = 0  # index for invoke types
    cx = 0  # index for calls
    iix = 0  # keep track of iterations
    start_time = time.time()
    for file in tqdm(list_of_files):
        if "checkpoint" in file:  # skip jupyter checkpoint files
            continue
        fn = direc + file
        filez = jf.load_json(fn)
        filename = file.replace(".json", "")
        akey, ax = add_key(key_lookup, filename, "a", ax, "apps")
        for block in filez:
            if len(block) > 0:  # skip empty blocks
                full_block = " ".join(block)
                # add block to lookup table and get a key
                bkey, bx = add_key(key_lookup, full_block, "b", bx, "blocks")
                for call in block:
                    try:
                        api_call = call.split("}, ")[1].split(" ")[0].strip()
                        ckey, cx = add_key(key_lookup, api_call, "c", cx,
                                           "calls")
                        append_value(A, akey, ckey)  # append key to dictionary
                        append_value(B, bkey, ckey)
                        package = call.split(";")[0].split(",")[-1].strip()
                        pkey, px = add_key(key_lookup, package, "p", px,
                                           "packages")
                        append_value(P, pkey, ckey)
                        invoke_type = call.split("}, ")[0].split(
                            " ")[0].strip()
                        ikey, ix = add_key(key_lookup, invoke_type, "i", ix,
                                           "invokes")
                        append_value(I, ikey, ckey)
                        if ckey in C:
                            C[ckey] = C[ckey] + 1
                        else:
                            C[ckey] = 1
                        iix += 1
                    except IndexError:
                        continue

    if truncate:
        if verbose:
            print()
            print("Truncation is set to True; API calls occurring no more "
                  "than lower_bound_api_count times will be removed from "
                  "the model.")
            print("Number of API calls Before Truncation: " +
                  str(len(B.keys())))
        # remove API calls occurring at most lower_bound_api_count times
        # across the whole data set
        d = dict(
            (k, v) for k, v in C.items() if v <= lower_bound_api_count)
        for i in [B, P, I, A]:
            for k in d.keys():
                try:
                    del i[k]
                except KeyError:
                    continue
        if verbose:
            print("Number of API calls After Truncation: " +
                  str(len(B.keys())))
            print()

    # save the key_lookup table to the "key_directory" config parameter in
    # dict_build.json
    jf.save_json(key_lookup, key_dst)
    jf.save_json(C, call_dst)
    if verbose:
        print("Saving node key lookup table to: %s" % key_dst)
        print("Saving api call list to: %s" % call_dst)
    return B, P, I, A
def get_node_data(display_data=True, **kwargs):
    '''
    Function to get dataframes of malicious and benign apps. Dataframes are
    rowwise, with columns for each |<app>|<block>|<api_call>|<package>|
    combination.

    Parameters
    ----------
    verbose: logical, required
        If true print updates on eda. Do not print updates otherwise
    lim: int, required
        If not `None` then limit apps by that amount
    multiprocessing: logical, required
        If true run with multiprocessing. Run in serial otherwise
    save_data: logical, required
        If true save <mal_df> and <ben_df> to <dst>
    mal_filename: str, required
        filename to save malware node data to
    ben_filename: str, required
        filename to save benign node data to
    key_fn: str, required
        Path to json file lookup table of node codes for the respective
        <app>, <block>, <api_call>, and <package> strings
    src: str, required
        Path of parsed smali code in json files
    dst: str, required
        Path to save <mal_df> and <ben_df> as csv files

    Returns
    -------
    Two dataframes.
        arg1: Dataframe of malicious app-block-api call-package combinations
        arg2: Dataframe of benign app-block-api call-package combinations
    '''
    verbose = kwargs["verbose"]
    lim = kwargs["lim"]
    multiprocessing = kwargs["multiprocessing"]
    save_data = kwargs["save_data"]
    mal_filename = kwargs["mal_filename"]
    ben_filename = kwargs["ben_filename"]
    key_fn = kwargs["key_fn"]
    src = kwargs["src"]
    dst = kwargs["dst"]

    if verbose:
        print("Retrieving naming key from `%s`" % key_fn)
        start = time.time()
        key = jf.load_json(key_fn)["get_key"]
        print("Retrieved naming key in %i seconds" % (time.time() - start))
    else:
        key = jf.load_json(key_fn)["get_key"]

    # malware files are flagged with "_M_", benign files with "_B_"
    mal_apps = [(os.path.join(src, file), key, multiprocessing)
                for file in os.listdir(src) if "_M_" in file]
    ben_apps = [(os.path.join(src, file), key, multiprocessing)
                for file in os.listdir(src) if "_B_" in file]
    if lim is not None:
        mal_apps = mal_apps[:lim]
        ben_apps = ben_apps[:lim]

    mal_data = []
    ben_data = []
    if verbose:
        print("Found %i malicious apps and %i benign apps to extract data "
              "from" % (len(mal_apps), len(ben_apps)))
        start = time.time()
        for app in tqdm(mal_apps):
            mal_data.append(read_data_extract_to_df(app))
        print("Retrieved data on %i malicious apps in %i seconds" %
              (len(mal_data), time.time() - start))
        start = time.time()
        for app in tqdm(ben_apps):
            ben_data.append(read_data_extract_to_df(app))
        print("Retrieved data on %i benign apps in %i seconds" %
              (len(ben_data), time.time() - start))
    else:
        for app in tqdm(mal_apps):
            mal_data.append(read_data_extract_to_df(app))
        for app in tqdm(ben_apps):
            ben_data.append(read_data_extract_to_df(app))

    mal_df = pd.concat(mal_data)
    ben_df = pd.concat(ben_data)
    if save_data:
        mal_fn = os.path.join(dst, mal_filename)
        ben_fn = os.path.join(dst, ben_filename)
        if verbose:
            print("Saving malware app data to %s" % mal_fn)
            print("Saving benign app data to %s" % ben_fn)
        # write the dataframes out as csv files
        mal_df.to_csv(mal_fn, index=False)
        ben_df.to_csv(ben_fn, index=False)
    if display_data:
        print("Malware data:")
        display(mal_df.head())
        print("\nBenign data:")
        display(ben_df.head())
    return mal_df, ben_df
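
# Example usage (a minimal sketch; the paths and filenames are hypothetical
# placeholders for the values normally supplied via config/params.json):
#
#   mal_df, ben_df = get_node_data(verbose=True,
#                                  lim=5,
#                                  multiprocessing=False,
#                                  save_data=False,
#                                  mal_filename="mal_nodes.csv",
#                                  ben_filename="ben_nodes.csv",
#                                  key_fn="config/dict/data_naming_key.json",
#                                  src="data/extracted/",
#                                  dst="data/out/")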
if __name__ == "__main__":
    param_fp = os.path.join('config', 'params.json')
    params = jf.load_json(param_fp)
    eda_params = params['eda-params']
    src = eda_params['data_extract_loc']
    key_fn = eda_params["data_naming_key"]
    # get_node_data reads every option from kwargs, so all of them must be
    # supplied here
    malware, benign = get_node_data(
        verbose=True,
        lim=None,
        multiprocessing=False,
        save_data=True,
        mal_filename=eda_params["malware_node_filename"],
        ben_filename=eda_params["benign_node_filename"],
        key_fn=key_fn,
        src=src,
        dst=eda_params["save_dir"])
    ----------
    cmd_line_args: listOfStrings, required
        list of command line arguments passed
    params: dictionary, required
        parameter configuration dictionary pulled from `config/params.json`

    Returns
    -------
    None
    '''
    print()
    # get and apply command line arguments
    args_params = params["options"]
    cmd_line_args_dict = get_command_ling_args(cmd_line_args, args_params)
    params["eda-params"]["args_literal"] = cmd_line_args
    params = apply_command_line_args(cmd_line_args_dict, params)
    out_fn = os.path.join(params["out_path"], params["params_name"])
    jf.save_json(params, out_fn)
    kwargs = {"cmd_line_args": cmd_line_args_dict, "params": params}
    Main.run_all(kwargs)
    print()
    sys.exit()


if __name__ == "__main__":
    args = sys.argv[1:]
    data_params = jf.load_json("config/params.json")
    run(args, data_params)
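
# Example invocation from the command line (a minimal sketch; the script
# filename and the available flags are hypothetical, since the flags are
# defined by the "options" section of config/params.json):
#
#   python run.py --verbose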