Example #1
def get_categories(directory):
    '''build a dict mapping category name to integer index'''
    # progressbar
    file_count = count_file(directory, HCG_FILE_NAME)
    pbar = use_progressbar('Get categories...', file_count)
    pbar.start()
    progress = 0

    category_dict = dict()
    category_index = 0
    for parent, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename == HCG_FILE_NAME:
                # category = name of the grandparent directory of the
                # folder holding hcg.json
                category = os.path.basename(
                    os.path.split(os.path.split(parent)[0])[0])
                if category not in category_dict:
                    category_dict[category] = category_index
                    category_index += 1

                # progressbar
                progress += 1
                pbar.update(progress)

    # progressbar
    pbar.finish()

    return category_dict
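Every example below leans on two helpers, count_file and use_progressbar, that are defined elsewhere in the codebase. A minimal sketch of what they plausibly look like, assuming the classic progressbar package (both bodies are assumptions, not the original implementations):

import os
import progressbar

def count_file(directory, name):
    '''Count files under directory whose name matches the given pattern.'''
    # Assumed behavior: exact match for names like HCG_FILE_NAME,
    # suffix match for patterns like '.apk'.
    total = 0
    for parent, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename == name or filename.endswith(name):
                total += 1
    return total

def use_progressbar(label, maxval):
    '''Build a labeled progress bar; callers drive start()/update()/finish().'''
    widgets = [label, ' ', progressbar.Percentage(), ' ', progressbar.Bar()]
    return progressbar.ProgressBar(widgets=widgets, maxval=maxval)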
Example #2
def remove_empty_hcg(directory):
    '''remove hcg.json files that are effectively empty'''
    # progressbar
    file_count = count_file(directory, HCG_FILE_NAME)
    pbar = use_progressbar('Removing empty hcgs...', file_count)
    pbar.start()
    progress = 0

    empty_hcg_list = []
    count = 0
    for parent, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename == HCG_FILE_NAME:
                hcg_file = os.path.join(parent, filename)
                statinfo = os.stat(hcg_file)
                hcg_size = statinfo.st_size
                if hcg_size <= 2:  # a file holding only '{}' is 2 bytes
                    empty_hcg_list.append(hcg_file)
                    count += 1
                    os.remove(hcg_file)

                # progressbar
                progress += 1
                pbar.update(progress)

    # progressbar
    pbar.finish()

    print '[SC] Removed %d empty hcgs' % count
    for empty_hcg in empty_hcg_list:
        print empty_hcg
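The size cutoff works because a JSON file that holds nothing but an empty object is exactly 2 bytes. A quick self-contained check (the temp directory and file name here are illustrative only):

import json
import os
import tempfile

path = os.path.join(tempfile.mkdtemp(), 'hcg.json')
with open(path, 'w') as f:
    json.dump({}, f)              # writes the two characters "{}"
print(os.stat(path).st_size)      # -> 2, so hcg_size <= 2 flags it as empty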
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', help='directory of the apk')
    args = parser.parse_args()
    if args.directory:
        # progressbar
        file_count = count_file(args.directory, '.apk')
        pbar = use_progressbar('networkxifying call graph...', file_count)
        pbar.start()
        progress = 0

        for parent, dirnames, filenames in os.walk(args.directory):
            for filename in filenames:
                if filename.endswith('.apk'):
                    # print(os.path.join(parent, filename))
                    cg, graphdir = generate(os.path.join(parent, filename))
                    fcg = networkxify(cg)
                    h = os.path.splitext(filename)[0]
                    fnx = os.path.join(graphdir, "{}.pz".format(h))
                    pz.save(fcg, fnx)  # fnx already includes graphdir

                    # progressbar
                    progress += 1
                    pbar.update(progress)

        # progressbar
        pbar.finish()
    else:
        parser.print_help()
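The pz module used for serialization is not shown here. A plausible minimal sketch, assuming it simply pickles the graph object and compresses it with zlib (the real module may differ):

import pickle
import zlib

def save(obj, path):
    '''Serialize the object, compress it, and write it to disk.'''
    with open(path, 'wb') as f:
        f.write(zlib.compress(pickle.dumps(obj, 2)))

def load(path):
    '''Read, decompress, and deserialize an object saved with save().'''
    with open(path, 'rb') as f:
        return pickle.loads(zlib.decompress(f.read()))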
Example #4
def count(directory):
    '''count all the individual hash values'''
    # 1. iterate through all the hcg.json files
    # 2. get hash values from a hcg.json file
    # 3. merge the hash values into one file

    # progressbar
    file_count = count_file(directory, HCG_FILE_NAME)
    pbar = use_progressbar('Calculating maximum occurrence...', file_count)
    pbar.start()
    progress = 0

    hash_dict = dict()
    for parent, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename == HCG_FILE_NAME:
                hash_dict = merge_hash_dict(get_hash(os.path.join(parent, filename)), hash_dict)

                # progressbar
                progress += 1
                pbar.update(progress)

    # progressbar
    pbar.finish()

    return hash_dict
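Neither get_hash nor merge_hash_dict is shown. A hedged sketch of what they might do, assuming each hcg.json maps node names to dicts with an 'nhash' field (as in Example #5) and that 'maximum occurrence' means the per-file peak count of each hash value:

import json

def get_hash(hcg_file):
    '''Count how often each node hash occurs in one hcg.json file.'''
    counts = {}
    with open(hcg_file) as f:
        hcg = json.load(f)
    for node in hcg:
        nhash = hcg[node]['nhash']
        counts[nhash] = counts.get(nhash, 0) + 1
    return counts

def merge_hash_dict(new_counts, hash_dict):
    '''Fold per-file counts into the running maximum per hash value.'''
    for nhash, n in new_counts.items():
        hash_dict[nhash] = max(hash_dict.get(nhash, 0), n)
    return hash_dict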
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', help='directory of the apk')
    args = parser.parse_args()

    if args.directory:
        # progressbar
        file_count = count_file(args.directory, HCG_FILE_NAME)
        pbar = use_progressbar('double hashing...', file_count)
        pbar.start()
        progress = 0

        for parent, dirnames, filenames in os.walk(args.directory):
            for filename in filenames:
                if filename == HCG_FILE_NAME:
                    graphdir = parent
                    hcg = read_hashed_call_graph(os.path.join(
                        parent, filename))
                    for node in hcg:
                        hcg[node]['label'] = hcg[node]['nhash']
                    double_hcg = neighborhood_hash(hcg, graphdir)
                    save_to_file(double_hcg, graphdir)

                    # progressbar
                    progress += 1
                    pbar.update(progress)

        # progressbar
        pbar.finish()
    else:
        parser.print_help()
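neighborhood_hash is defined elsewhere; the usual neighborhood-hash relabeling replaces each node's label with its own bit-rotated label XORed with the labels of its neighbors. A simplified sketch (the data layout with a 'neighbors' list and the fixed 32-bit width are assumptions, and the real function also takes graphdir):

def neighborhood_hash_sketch(hcg, bits=32):
    '''Relabel each node as ROT1(own label) XOR (XOR of neighbor labels).'''
    # hcg is assumed to map node -> {'label': int, 'neighbors': [node, ...]}
    mask = (1 << bits) - 1
    hashed = {}
    for node, data in hcg.items():
        # Rotate the node's own label left by one bit within the mask...
        h = ((data['label'] << 1) | (data['label'] >> (bits - 1))) & mask
        # ...then XOR in each neighbor's label.
        for neighbor in data['neighbors']:
            h ^= hcg[neighbor]['label']
        hashed[node] = h
    return hashed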
Example #6
def embed_all(directory):
    '''iteratively embed all the hashed call graph into a sparse matrix'''

    # 1. get category dict
    category_dict = get_categories(directory)

    # progressbar
    file_count = count_file(directory, HCG_FILE_NAME)
    pbar = use_progressbar('Computing label histogram...', file_count)
    pbar.start()
    progress = 0

    # 2. iteratively embed all the hashed call graph into matrix
    #    the label of each hashed call graph stored in truth_label
    #    record filenames in filename_list also
    matrix = []
    truth_label = np.array([])
    category_label = np.array([])
    filename_list = []
    for parent, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if filename == HCG_FILE_NAME:
                category = os.path.basename(
                    os.path.split(os.path.split(parent)[0])[0])
                truth_label = np.append(truth_label, category_dict[category])
                category_label = np.append(category_label, category)
                hcg = read_hashed_call_graph(os.path.join(parent, filename))
                x_i = compute_label_histogram(hcg)
                matrix.append(x_i)
                filename_list.append(os.path.split(parent)[1])

                # progressbar
                progress += 1
                pbar.update(progress)

    # progressbar
    pbar.finish()

    # 3. convert matrix to binary
    print '[SC] Converting python list to numpy matrix...'
    matrix = np.array(matrix, dtype=np.int16)
    save_as_arff(matrix, category_label)
    # print '[SC] Converting features vectors to binary...'
    # matrix, m = ml.make_binary(matrix)
    m = 0

    return matrix, m, truth_label, filename_list
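compute_label_histogram is not shown; a plausible sketch is a fixed-width histogram over the hashed node labels, so every graph maps to a feature vector of the same length. The bin count and the modulo bucketing here are assumptions:

import numpy as np

def compute_label_histogram_sketch(hcg, n_bins=2 ** 16):
    '''Count hashed node labels into a fixed number of bins.'''
    x = np.zeros(n_bins, dtype=np.int16)
    for node in hcg:
        # Bucket each label so arbitrary hash values fit the vector.
        x[hcg[node]['label'] % n_bins] += 1
    return list(x)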
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', help='directory of the apk')
    args = parser.parse_args()
    if args.directory:
        # progressbar
        file_count = count_file(args.directory, '.apk')
        pbar = use_progressbar('Generating hashed call graph...', file_count)
        pbar.start()
        progress = 0

        graph_time_list = []
        hash_time_list = []
        graph_node_time_list = []
        hash_node_time_list = []
        min_file_size = sys.maxint
        max_file_size = 0
        min_node_count = sys.maxint
        max_node_count = 0
        min_graph_time = sys.maxint
        max_graph_time = 0
        min_hash_time = sys.maxint
        max_hash_time = 0
        for parent, dirnames, filenames in os.walk(args.directory):
            for filename in filenames:
                if filename.endswith('.apk'):
                    apk_file = os.path.join(parent, filename)
                    # check file size
                    file_size = os.stat(apk_file).st_size

                    # graph generation and neighborhood hash
                    start_time = time.time()
                    cg, graphdir = generate(apk_file)
                    graph_time = time.time() - start_time

                    start_time = time.time()
                    hash_cg = neighborhood_hash(cg, graphdir)
                    hash_time = time.time() - start_time

                    save_to_file(hash_cg, graphdir)

                    # use float division so sub-megabyte sizes survive
                    graph_time_coordinate = (file_size / 1e6, graph_time)
                    hash_time_coordinate = (file_size / 1e6, hash_time)
                    graph_node_time_coordinate = (len(cg), graph_time)
                    hash_node_time_coordinate = (len(cg), hash_time)
                    graph_time_list.append(graph_time_coordinate)
                    hash_time_list.append(hash_time_coordinate)
                    graph_node_time_list.append(graph_node_time_coordinate)
                    hash_node_time_list.append(hash_node_time_coordinate)

                    if file_size > max_file_size:
                        max_file_size = file_size
                    if len(cg) > max_node_count:
                        max_node_count = len(cg)
                    if file_size < min_file_size:
                        min_file_size = file_size
                    if len(cg) < min_node_count:
                        min_node_count = len(cg)
                    if graph_time > max_graph_time:
                        max_graph_time = graph_time
                    if graph_time < min_graph_time:
                        min_graph_time = graph_time
                    if hash_time > max_hash_time:
                        max_hash_time = hash_time
                    if hash_time < min_hash_time:
                        min_hash_time = hash_time

                    # progressbar
                    progress += 1
                    pbar.update(progress)

        # progressbar
        pbar.finish()

        # sort list
        graph_time_list.sort(key=lambda tup: tup[0])
        hash_time_list.sort(key=lambda tup: tup[0])
        graph_node_time_list.sort(key=lambda tup: tup[0])
        hash_node_time_list.sort(key=lambda tup: tup[0])

        # save time consumption
        with open(os.path.join(args.directory, 'time_evaluation'), 'w') as f:
            f.write('max file size:%f\n' % (max_file_size / 1e6))
            f.write('min file size:%f\n' % (min_file_size / 1e6))
            f.write('max node count:%d\n' % max_node_count)
            f.write('min node count:%d\n' % min_node_count)
            f.write('max graph time:%f\n' % max_graph_time)
            f.write('min graph time:%f\n' % min_graph_time)
            f.write('max hash time:%f\n' % max_hash_time)
            f.write('min hash time:%f\n' % min_hash_time)
            f.write('graph generation(size):\n')
            for gtc in graph_time_list:
                f.write('%s\n' % str(gtc))

            f.write('\nneighborhood hash(size):\n')
            for htc in hash_time_list:
                f.write('%s\n' % str(htc))

            f.write('\ngraph generation(node):\n')
            for gntc in graph_node_time_list:
                f.write('%s\n' % str(gntc))

            f.write('\nneighborhood hash(node):\n')
            for hntc in hash_node_time_list:
                f.write('%s\n' % str(hntc))
    else:
        parser.print_help()
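The four coordinate lists written to time_evaluation are plain (x, y) tuples, so they plot directly. A small optional sketch using matplotlib (not part of the original script; file names are illustrative):

import matplotlib.pyplot as plt

def plot_timings(coords, xlabel, ylabel, out_file):
    '''Scatter-plot one of the saved (x, y) timing lists.'''
    xs = [c[0] for c in coords]
    ys = [c[1] for c in coords]
    plt.scatter(xs, ys)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.savefig(out_file)
    plt.close()

# e.g. plot_timings(graph_time_list, 'APK size (MB)',
#                   'graph generation time (s)', 'graph_time.png')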