def dump_indexed_strings_freq_distribution():
    try:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # iterate over all keys
        # for key in redis.keys():
        for key in redis.scan_iter():
            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                logger.info("%s,%d,", binascii.hexlify(key), redis.zcard(key))

    except Exception as e:
        logger.error("Error dumping freq distribution of indexed strings %s", str(e))

def dump_repo_signatures():
    try:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # iterate over all keys
        for key in redis.keys():
            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                logger.info("%s,%s,", binascii.hexlify(key), redis.get(key))
            # elif redis.type(key) == "hset":
            else:
                print(type(redis.type(key)), len(str(redis.type(key))))
                logger.info("%s -> %s,", key, redis.hgetall(key))

    except Exception as e:
        logger.error("Error dumping repo signatures %s", str(e))

def dump_indexed_strings():
    try:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # iterate over all keys
        mapping = {}
        revmapping = {}
        roots = []
        for key in redis.keys():
            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                key = binascii.hexlify(key)
                values = redis.zrange(key, 0, -1, withscores=False)
                logger.info("%s,%s,", key, values)
            elif redis.type(key) == "set":
                values = redis.smembers(key)
                logger.info("%s -> %s", key, values)
            elif redis.type(key) == "hash" and '_' in key:
                values = redis.hgetall(key)
                logger.info("%s -> %s", key, values)
            elif (redis.type(key) == "hash" and '-' in key and
                  key.split('-', 1)[0] in ['str', 'func', 'file', 'dir', 'branch', 'repo']):
                values = redis.hgetall(key)
                logger.info("%s -> %s", key, values)

                from common import skip_set
                for h, c in values.items():
                    # skip special purpose item
                    if h in skip_set or h == key:
                        continue
                    if len(str(h)) < 10:
                        roots.append(h)
                    if key not in mapping:
                        mapping[key] = []
                    mapping[key].append(h)
                    if h not in revmapping:
                        revmapping[h] = []
                    if key not in revmapping[h]:
                        revmapping[h].append(key)
            elif not redis.type(key) == "hash" and len(str(key)) < 15:
                values = redis.get(key)
                logger.info("%s -> %s", key, values)
            # else:
            #     logger.info("%s type %s", key, redis.type(key))

        if revmapping and mapping:
            for root in roots:
                dump_tree(root, 0, mapping, revmapping)

    except Exception as e:
        logger.error("Error dumping indexed strings %s", str(e))

def dump_indexed_unique_strings_repo_distribution():
    try:
        import matplotlib.pyplot as plt
        import numpy as np

        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        repo_all_strs = dict()
        repo_unq_strs = dict()

        # iterate over all keys
        for key in redis.keys():
            # check for interruption
            if signal.caught():
                logger.error("Interrupted")
                return None

            # zset datatype for hashed strings
            if redis.type(key) == "zset":
                for repo_id in redis.zrange(key, 0, -1, withscores=False):
                    if repo_id in repo_all_strs:
                        repo_all_strs[repo_id] += 1
                    else:
                        repo_all_strs[repo_id] = 1
                    if redis.zcard(key) == 1:
                        if repo_id in repo_unq_strs:
                            repo_unq_strs[repo_id] += 1
                        else:
                            repo_unq_strs[repo_id] = 1

        # format: repo_id, num_strs, num_uniq_strs, ratio_uniq_all_strs
        for repo_id, count in repo_all_strs.items():
            try:
                ratio = float(repo_unq_strs[repo_id]) / count
                logger.info("%s,%d,%d,%0.2f", repo_id, count,
                            repo_unq_strs[repo_id], ratio)
                repo_all_strs[repo_id] = ratio
            except Exception as e:
                logger.error("%s,%s", repo_id, str(e))

        plt.hist(list(repo_all_strs.values()), bins=50)  # np.logspace(1, 1000000, 100)
        plt.gca().set_xscale('log')
        plt.title("Unique/total strings across all repos")
        plt.xlabel('# Strings')
        plt.ylabel('# Repos')
        # plt.grid(True)
        plt.legend()
        plt.savefig('strings_hist', format='pdf')

    except ImportError as ie:
        logger.error("Error importing required modules: %s", str(ie))
    except Exception as e:
        logger.error("Error dumping stats of indexed strings per repo %s", str(e))

def run_counter(main, argv):
    """Entry point for the feature counting phase: count features for each
    input feature file, optionally fanned out to Celery workers."""
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger

    if not len(argv) == 1:
        logger.error('expects args: $feature_csv_list, but got: %s', argv)
        exit(1)

    input_path = argv[0]
    if not os.path.exists(input_path):
        logger.error("%s does not exist", input_path)
        exit(1)

    redis_rrc = main.rrc
    if not redis_rrc or not redis_rrc.handle():
        logger.error("redis rrc not available, exiting!")
        exit(1)

    input_list = get_input_list(main=main, input_list_file=input_path,
                                skip_input_callback=skip_input)
    # deduplicate!
    input_list = list(set(input_list))

    # start extracting
    if input_list:
        # track progress
        count = len(input_list)
        logger.info("Counting %d feature files", count)

        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        pb = utils.Progressbar("Counting features: ", count)
        pb.start()

        if main.QUEUING and main.QUEUING == "Celery":
            from celery import group
            from celery_tasks import feature_counter_worker

            # group jobs
            job = group(feature_counter_worker.s(infile) for infile in input_list)
            result = job.apply_async()

            # track worker progress
            completed = 0
            while result.waiting():
                completed = result.completed_count()
                if completed < count:
                    pb.update(completed)
                time.sleep(2)
        else:
            # non-parallel instance
            count = 0
            # scan loop
            for infile in input_list:
                # check for interruption
                if signal.caught():
                    break
                if count_features(main, infile):
                    count += 1
                # update progressbar
                pb.update(count)

        if not signal.caught():
            pb.finish()

def run_searcher(main, argv):
    """Entry point for the search phase: match each input apk against the
    indexed native and java databases, optionally via Celery workers."""
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger
    searching.logger = main.logger
    searching.stats_logger = main.stats_logger
    searching_java.logger = main.logger
    searching_java.stats_logger = main.stats_logger

    if len(argv) != 2:
        logger.error('expects two args')
        exit(1)

    # if we are just testing this repo
    if argv[0] == 'dump':
        main.TEST_REPO = True

    # check if redis is populated
    ndbsize, ndbval = main.nrc.dbsize()
    jdbsize, jdbval = main.jrc.dbsize()
    rdbsize, rdbval = main.rrc.dbsize()
    if ndbsize == 0 or jdbsize == 0:
        logger.error("Nothing is indexed in native or java redis db "
                     "(ndbsize: %s, jdbsize: %s, rdbsize: %s)! Exiting.",
                     ndbsize, jdbsize, rdbsize)
        exit(1)

    # check if path exists
    input_path = argv[1]
    if not os.path.exists(input_path):
        logger.error('%s does not exist', input_path)
        exit(1)

    apk_list = get_input_list(main=main, redis=main.rrc, redis_pipe=main.rrc.pipeline(),
                              input_path=input_path, input_type="apk",
                              skip_scanned=True, skip_failure=True)
    print("There are %d inputs to be searched" % len(apk_list))

    # start searching
    if apk_list:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # track progress
        count = len(apk_list)
        logger.info("Searching %d applications", count)
        pb = utils.Progressbar('Matching libs: ', count)
        pb.start()

        # if requested parallelism
        if main.QUEUING and main.QUEUING == "Celery":
            from celery import group
            from celery_tasks import search_apk_worker

            # group jobs
            job = group(search_apk_worker.s(app_path) for app_path in apk_list)
            result = job.apply_async()

            # track worker progress
            completed = 0
            while result.waiting():
                time.sleep(5)
                completed = result.completed_count()
                if completed < count:
                    pb.update(completed)

            # all done
            pb.finish()
            result.get()
        else:
            # non-parallel instance
            # search loop
            count = 0
            for app_path in apk_list:
                # check for interruption
                if signal.caught():
                    break
                # lookup apk
                search_apk(main, app_path)
                # update progressbar
                count += 1
                pb.update(count)

            # all done
            if not signal.caught() and pb:
                pb.finish()
    else:
        logger.error("No apk(s) to search")

def run_signature(main, argv):
    """Entry point for the signature phase: signature each input library or
    application (jar by default), optionally in Celery job chunks."""
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger

    # the outer args
    if len(argv) != 2:
        logger.error('expects two args')
        exit(1)
    if argv[0] == 'dump':
        main.TEST_REPO = True

    # the inner args
    argv = argv[1]
    if len(argv) < 1 or len(argv) > 2:
        logger.error('expects args: $input_path [$input_type] [-d]')
        exit(1)

    input_path = argv[0]
    input_type = argv[1] if len(argv) == 2 else 'jar'
    if not os.path.exists(input_path):
        logger.error('%s does not exist', input_path)
        exit(1)

    input_list = get_input_list(main=main, redis=None, redis_pipe=None,
                                input_path=input_path, input_type=input_type,
                                path_as_id=True, skip_scanned=False,
                                skip_signatured=True, skip_failure=True)
    print("There are %d inputs to be signatured" % len(input_list))

    # start signature
    # query the database
    if input_list:
        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        # track progress
        count = len(input_list)
        logger.info("Matching %d libraries/applications", count)

        # if requested parallelism
        if main.QUEUING and main.QUEUING == 'Celery':
            from celery import group
            from celery_tasks import signature_java_worker

            # group jobs in chunks of JOB_CHUNK
            input_count = len(input_list)
            for index in range(0, input_count, JOB_CHUNK):
                tmp_input_list = input_list[index:min(index + JOB_CHUNK, input_count)]
                if index + JOB_CHUNK > input_count:
                    logger.info("Processing chunk %d (%d inputs)",
                                index // JOB_CHUNK + 1, input_count - index)
                else:
                    logger.info("Processing chunk %d (%d inputs)",
                                index // JOB_CHUNK + 1, JOB_CHUNK)
                job = group(signature_java_worker.s(item, input_type)
                            for item in tmp_input_list)
                result = job.apply_async()
                try:
                    result.get()
                except Exception as e:
                    logger.error("Error signaturing jobs: %s", str(e))
        else:
            # non-parallel instance
            pb = utils.Progressbar('Matching libs/apps: ', count)
            pb.start()
            count = 0
            for item in input_list:
                # check for interruption
                if signal.caught():
                    break
                if main.TEST_REPO:
                    pb.msg('Testing {0} '.format(item))
                else:
                    pb.msg('Signaturing {0} '.format(item))
                # signature libs/apps
                signature_classes(main=main, input_path=item, input_type=input_type)
                # update progressbar
                count += 1
                pb.update(count)

            # all done
            if not signal.caught() and pb:
                pb.finish()
    else:
        logger.error("No lib(s) to signature")

def run_validator(main, argv):
    """Entry point for the validation phase: validate each input apk,
    optionally fanned out to Celery workers."""
    global logger, stats_logger
    logger = main.logger
    stats_logger = main.stats_logger

    if not len(argv) == 1:
        logger.error('expects args: $apks_to_validate, but got: %s', argv)
        exit(1)

    input_path = argv[0]
    if not os.path.exists(input_path):
        logger.error("%s does not exist", input_path)
        exit(1)

    input_list = get_input_list(main=main, redis=main.rrc.handle(),
                                redis_pipe=main.rrc.pipeline(),
                                input_path=input_path, path_as_id=True,
                                skip_scanned=main.ignore_scanned)
    # deduplicate!
    input_list = list(set(input_list))

    # start crawling
    if input_list:
        # track progress
        count = len(input_list)
        logger.info("Validating %d applications", count)

        # register signal handler
        signal = utils.Signal()
        signal.install([utils.Signal.SIGINT, utils.Signal.SIGTERM])

        pb = utils.Progressbar('Validating applications: ', count)
        pb.start()

        if main.QUEUING and main.QUEUING == "Celery":
            from celery import group
            from celery_tasks import validate_worker

            # group jobs
            job = group(validate_worker.s(app_path) for app_path in input_list)
            result = job.apply_async()

            # track worker progress
            completed = 0
            while result.waiting():
                completed = result.completed_count()
                if completed < count:
                    pb.update(completed)
                time.sleep(2)
        else:
            # non-parallel instance
            count = 0
            # scan loop
            for app_path in input_list:
                # check for interruption
                if signal.caught():
                    break
                if validate_apk(main=main, app_path=app_path):
                    count += 1
                # update progressbar
                pb.update(count)

        if not signal.caught():
            pb.finish()