def get_categories(directory):
    '''Build a mapping from category name to a dense integer index.

    Walks *directory* looking for HCG files; the category name is the
    basename of the directory two levels above each file.  Every new
    category receives the next unused index, in discovery order.
    '''
    # progressbar
    total = count_file(directory, HCG_FILE_NAME)
    bar = use_progressbar('Get categories...', total)
    bar.start()
    done = 0
    categories = {}
    for parent, _dirnames, names in os.walk(directory):
        for name in names:
            if name != HCG_FILE_NAME:
                continue
            # category = basename of the grandparent of the file's directory
            grandparent = os.path.split(os.path.split(parent)[0])[0]
            label = os.path.basename(grandparent)
            if label not in categories:
                # len(categories) is exactly the next sequential index
                categories[label] = len(categories)
            # progressbar
            done += 1
            bar.update(done)
    # progressbar
    bar.finish()
    return categories
def remove_emtpy_hcg(directory):
    '''Delete every HCG file that is effectively empty (<= 2 bytes),
    then print how many were removed and each removed path.
    '''
    # progressbar
    total = count_file(directory, HCG_FILE_NAME)
    bar = use_progressbar('Removing empty hcgs...', total)
    bar.start()
    done = 0
    removed = []
    for parent, _dirnames, names in os.walk(directory):
        for name in names:
            if name != HCG_FILE_NAME:
                continue
            path = os.path.join(parent, name)
            # an "empty" hcg serialises to at most 2 bytes (e.g. '{}')
            if os.stat(path).st_size <= 2:
                removed.append(path)
                os.remove(path)
            # progressbar
            done += 1
            bar.update(done)
    # progressbar
    bar.finish()
    print('[SC] Removed %d empty hcgs' % len(removed))
    for path in removed:
        print(path)
def main():
    '''Convert every .apk under --directory into a networkx call graph
    and save it next to the apk as ``<name>.pz``.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', help='directory of the apk')
    args = parser.parse_args()
    if args.directory:
        # progressbar
        file_count = count_file(args.directory, '.apk')
        pbar = use_progressbar('networkxifying call graph...', file_count)
        pbar.start()
        progress = 0
        for parent, dirnames, filenames in os.walk(args.directory):
            for filename in filenames:
                if filename.endswith('.apk'):
                    cg, graphdir = generate(os.path.join(parent, filename))
                    fcg = networkxify(cg)
                    h = os.path.splitext(filename)[0]
                    # BUG FIX: the old code built fnx = join(graphdir, h.pz)
                    # and then joined graphdir AGAIN in pz.save(...), which
                    # duplicated the path whenever graphdir was relative.
                    fnx = os.path.join(graphdir, "{}.pz".format(h))
                    pz.save(fcg, fnx)
                    # progressbar
                    progress += 1
                    pbar.update(progress)
        # progressbar
        pbar.finish()
    else:
        parser.print_help()
def count(directory):
    '''Accumulate the occurrences of all individual hash values.

    Iterates through every HCG file under *directory*, extracts its hash
    values and merges them into a single dict, which is returned.
    '''
    # progressbar
    total = count_file(directory, HCG_FILE_NAME)
    bar = use_progressbar('Calculating maximum occurrence', total)
    bar.start()
    done = 0
    merged = {}
    for parent, _dirnames, names in os.walk(directory):
        for name in names:
            if name != HCG_FILE_NAME:
                continue
            # fold this file's hash values into the running dict
            hashes = get_hash(os.path.join(parent, name))
            merged = merge_hash_dict(hashes, merged)
            # progressbar
            done += 1
            bar.update(done)
    # progressbar
    bar.finish()
    return merged
def main():
    '''Apply a second round of neighborhood hashing to every stored HCG
    under --directory, then save the result back beside the original.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', help='directory of the apk')
    args = parser.parse_args()
    if args.directory:
        # progressbar
        total = count_file(args.directory, HCG_FILE_NAME)
        bar = use_progressbar('double hashing...', total)
        bar.start()
        done = 0
        for parent, _dirnames, names in os.walk(args.directory):
            for name in names:
                if name != HCG_FILE_NAME:
                    continue
                graphdir = parent
                hcg = read_hashed_call_graph(os.path.join(parent, name))
                # promote the first-round hash to the node label so the
                # second hashing pass operates on it
                for node in hcg:
                    hcg[node]['label'] = hcg[node]['nhash']
                double_hcg = neighborhood_hash(hcg, graphdir)
                save_to_file(double_hcg, graphdir)
                # progressbar
                done += 1
                bar.update(done)
        # progressbar
        bar.finish()
    else:
        parser.print_help()
def embed_all(directory):
    '''Iteratively embed all the hashed call graphs into a sparse matrix.

    Returns ``(matrix, m, truth_label, filename_list)`` where *matrix* is
    an int16 numpy array of per-graph label histograms, *truth_label* the
    numeric category of each row, and *filename_list* the sample names.
    '''
    # 1. get category dict
    category_dict = get_categories(directory)
    # progressbar
    file_count = count_file(directory, HCG_FILE_NAME)
    pbar = use_progressbar('Computing label histogram...', file_count)
    pbar.start()
    progress = 0
    # 2. iteratively embed all the hashed call graphs.
    # PERF FIX: accumulate labels in plain lists — np.append copies the
    # whole array on every call, making the walk O(n^2).
    matrix = []
    truth_list = []
    category_list = []
    filename_list = []
    for parent, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename == HCG_FILE_NAME:
                # category = basename of the grandparent directory
                category = os.path.basename(
                    os.path.split(os.path.split(parent)[0])[0])
                truth_list.append(category_dict[category])
                category_list.append(category)
                hcg = read_hashed_call_graph(os.path.join(parent, filename))
                matrix.append(compute_label_histogram(hcg))
                filename_list.append(os.path.split(parent)[1])
                # progressbar
                progress += 1
                pbar.update(progress)
    # progressbar
    pbar.finish()
    # 3. convert matrix to numpy form
    print('[SC] Converting python list to numpy matrix...')
    matrix = np.array(matrix, dtype=np.int16)
    # dtype=float matches what repeated np.append onto np.array([]) produced
    truth_label = np.array(truth_list, dtype=float)
    category_label = np.array(category_list)
    save_as_arff(matrix, category_label)
    m = 0
    return matrix, m, truth_label, filename_list
def main():
    '''Generate a hashed call graph for every .apk under --directory and
    record timing statistics (graph generation vs. neighborhood hashing,
    against apk file size and graph node count) to a ``time_evaluation``
    file in that directory.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', help='directory of the apk')
    args = parser.parse_args()
    if args.directory:
        # progressbar
        file_count = count_file(args.directory, '.apk')
        pbar = use_progressbar('Generating hashed call graph...', file_count)
        pbar.start()
        progress = 0
        # (x, y) coordinate lists: x is file size (MB) or node count,
        # y is the elapsed time of the corresponding phase
        graph_time_list = []
        hash_time_list = []
        graph_node_time_list = []
        hash_node_time_list = []
        # running extrema; sys.maxint (Python 2) marks "no sample yet"
        min_file_size = sys.maxint
        max_file_size = 0
        min_node_count = sys.maxint
        max_node_count = 0
        min_graph_time = sys.maxint
        max_graph_time = 0
        min_hash_time = sys.maxint
        max_hash_time = 0
        for parent, dirnames, filenames in os.walk(args.directory):
            for filename in filenames:
                if filename.endswith('.apk'):
                    apk_file = os.path.join(parent, filename)
                    # check file size
                    file_size = os.stat(apk_file).st_size
                    # graph generation and neighborhood hash, each timed
                    start_time = time.time()
                    cg, graphdir = generate(apk_file)
                    graph_time = time.time() - start_time
                    start_time = time.time()
                    hash_cg = neighborhood_hash(cg, graphdir)
                    hash_time = time.time() - start_time
                    save_to_file(hash_cg, graphdir)
                    # NOTE(review): under Python 2 this is integer division,
                    # i.e. whole megabytes — presumably intentional; confirm
                    graph_time_coordinate = (file_size/(10**6), graph_time)
                    hash_time_coordinate = (file_size/(10**6), hash_time)
                    graph_node_time_coordinate = (len(cg), graph_time)
                    hash_node_time_coordinate = (len(cg), hash_time)
                    graph_time_list.append(graph_time_coordinate)
                    hash_time_list.append(hash_time_coordinate)
                    graph_node_time_list.append(graph_node_time_coordinate)
                    hash_node_time_list.append(hash_node_time_coordinate)
                    # update running extrema for the summary header
                    if file_size > max_file_size:
                        max_file_size = file_size
                    if len(cg) > max_node_count:
                        max_node_count = len(cg)
                    if file_size < min_file_size:
                        min_file_size = file_size
                    if len(cg) < min_node_count:
                        min_node_count = len(cg)
                    if graph_time > max_graph_time:
                        max_graph_time = graph_time
                    if graph_time < min_graph_time:
                        min_graph_time = graph_time
                    if hash_time > max_hash_time:
                        max_hash_time = hash_time
                    if hash_time < min_hash_time:
                        min_hash_time = hash_time
                    # progressbar
                    progress += 1
                    pbar.update(progress)
        # progressbar
        pbar.finish()
        # sort list by x coordinate so the dump is plottable in order
        graph_time_list.sort(key=lambda tup: tup[0])
        hash_time_list.sort(key=lambda tup: tup[0])
        graph_node_time_list.sort(key=lambda tup: tup[0])
        hash_node_time_list.sort(key=lambda tup: tup[0])
        # save time consumption
        f = open(os.path.join(args.directory,'time_evaluation'), 'w')
        f.write('max file size:%f\n' % (max_file_size/(10**6)) )
        f.write('min file size:%f\n' % (min_file_size/(10**6)) )
        f.write('max node count:%d\n' % max_node_count)
        f.write('min node count:%d\n' % min_node_count)
        f.write('max graph time:%f\n' % max_graph_time)
        f.write('min graph time:%f\n' % min_graph_time)
        f.write('max hash time:%f\n' % max_hash_time)
        f.write('min hash time:%f\n' % min_hash_time)
        f.write('graph generation(size):\n')
        for gtc in graph_time_list:
            f.write(str(gtc))
        f.write('\nneighborhood hash(size):\n')
        for htc in hash_time_list:
            f.write(str(htc))
        f.write('\ngraph generation(node):\n')
        for gntc in graph_node_time_list:
            f.write(str(gntc))
        f.write('\nneighborhood hash(node):\n')
        for hntc in hash_node_time_list:
            f.write(str(hntc))
        f.close()
    else:
        parser.print_help()