def reformat(apis_file, all_sources, all_sinks, json_result_file, outfile):
    try:
        results = json.load(open(json_result_file, 'r'))
    except Exception as e:
        logging.error("failed to load progpilot results in json: %s", json_result_file)
        return None
    logging.warning("there are %d sources and %d sinks checked!", len(all_sources), len(all_sinks))
    # load the astgen config from file
    config = AstLookupConfig()
    read_proto_from_file(config, apis_file, binary=False)
    logging.warning("loaded config with %d apis to check!", len(config.apis))
    result = ModuleResult()
    set_result(result=result, apis=config.apis, all_sources=all_sources, all_sinks=all_sinks,
               flows=results)
    summary = ModuleSummary()
    set_summary(summary=summary, apis=config.apis, all_sources=all_sources, all_sinks=all_sinks,
                new_sources=None, new_sinks=None)
    static = ModuleStatic()
    static.flows.MergeFrom(result.flows)
    static.dangers.MergeFrom(result.dangers)
    static.sources.MergeFrom(summary.sources)
    static.sinks.MergeFrom(summary.sinks)
    static.taint_wrappers.MergeFrom(summary.taint_wrappers)
    write_proto_to_file(proto=static, filename=outfile, binary=False)
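# Hedged usage sketch: the file names below are hypothetical, and all_sources/all_sinks
# stand for the source and sink collections the caller has already derived from the config.
# The call converts progpilot's JSON flows into the ModuleStatic text-format proto
# consumed downstream.
reformat(apis_file='astgen_php_smt.config', all_sources=all_sources, all_sinks=all_sinks,
         json_result_file='pkg.progpilot.json', outfile='pkg.static')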
def run_extractor_worker(infile, outdir, extract_types=['SO'], in_type='APK',
                         store_type='file_with_symlink', skip_processed=False, binary=False):
    # 1. extract the dex/so files, and store them to outdir. For each file, create a
    #    symbolic link using hash (used for deduplication)
    # 2. also store the ".components" file to the outdir
    logging.info("Processing %s", infile)
    if skip_processed and exists(join(outdir, infile + COMPONENTS_SUFFIX)):
        logging.info("Skipping processed infile %s", infile)
        return
    extract_config = repo_pb.ExtractConfig()
    for extract_type in extract_types:
        extract_config.extract_types.append(getattr(repo_pb, extract_type))
    file_digest = hashfile(open(infile, 'rb'), hashlib.sha1())
    extract_config.inspect_compressed_files = True
    extract_config.in_path = infile.encode('utf8') if isinstance(infile, unicode) else infile
    extract_config.in_digest = file_digest
    extract_config.store_type = store_type
    extract_config.in_type = getattr(repo_pb, in_type)
    extract_config.out_path = outdir.encode('utf8') if isinstance(outdir, unicode) else outdir
    # extract types of files from infile, use symbolic links for deduplication!
    extract_from_file_or_repo(extract_config=extract_config)
    outfile = join(extract_config.out_path, basename(extract_config.in_path) + COMPONENTS_SUFFIX)
    write_proto_to_file(proto=extract_config, filename=outfile, binary=binary)
    logging.info("extracted %d components from %s, and saved output to %s",
                 len(extract_config.components), infile, outfile)
    return outfile
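# Hedged usage sketch (paths hypothetical): pull native libraries out of one APK and drop a
# ".components" summary proto next to the extracted files.
components_file = run_extractor_worker(infile='/data/apks/foo.apk', outdir='/data/extracted',
                                       extract_types=['SO'], in_type='APK')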
def astgen(self, inpath, outfile, root=None, configpath=None,
           pkg_name=None, pkg_version=None, evaluate_smt=False):
    analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)
    astgen_cmd = ['java', '-jar', 'target/astgen-java-1.0.0-jar-with-dependencies.jar',
                  '-inpath', analyze_path, '-outfile', outfile, '-config', configpath]
    if isdir(analyze_path):
        raise Exception("Soot doesn't take a directory as input: %s" % analyze_path)
    if analyze_path.endswith((".apk", ".dex")):
        # processing android apps requires android.jar
        astgen_cmd.extend(['-android_jar_dir', 'platforms/'])
    if analyze_path.endswith(".apk"):
        astgen_cmd.extend(['-intype', 'APK', '-process_dir', analyze_path])
    elif analyze_path.endswith(".dex"):
        astgen_cmd.extend(['-intype', 'DEX', '-process_dir', analyze_path])
    elif analyze_path.endswith(".java"):
        astgen_cmd.extend(['-intype', 'SOURCE', '-process_dir', dirname(analyze_path)])
    elif analyze_path.endswith(".class"):
        astgen_cmd.extend(['-intype', 'CLASS', '-process_dir', dirname(analyze_path)])
    elif analyze_path.endswith(".jar"):
        # this is the default input type
        astgen_cmd.extend(['-intype', 'JAR', '-process_dir', analyze_path])
    elif analyze_path.endswith(".aar"):
        # aar contains /classes.jar
        # https://developer.android.com/studio/projects/android-library
        astgen_cmd.extend(['-android_jar_dir', 'platforms/'])
        aar_file = get_file_with_meta(analyze_path)
        class_jar_content = aar_file.accessor.read('classes.jar')
        analyze_path_jar = join(dirname(analyze_path), splitext(basename(analyze_path))[0] + '.jar')
        open(analyze_path_jar, 'wb').write(class_jar_content)
        astgen_cmd.extend(['-intype', 'JAR', '-process_dir', analyze_path_jar])
    elif analyze_path.endswith(".war"):
        # war contains lots of jar files in /WEB-INF/lib/
        # http://one-jar.sourceforge.net/
        logging.error("Not handling .war file yet: %s", analyze_path)
    else:
        logging.error("Input path has unexpected suffix: %s", analyze_path)
    # root is not used here
    if pkg_name is not None:
        astgen_cmd.extend(['-package_name', pkg_name])
    if pkg_version is not None:
        astgen_cmd.extend(['-package_version', pkg_version])
    exec_command("java astgen", astgen_cmd, cwd="static_proxy/astgen-java")
    # optionally evaluate smt formula
    if evaluate_smt:
        resultpb = PkgAstResults()
        read_proto_from_file(resultpb, filename=outfile, binary=False)
        satisfied = self._check_smt(astgen_results=[resultpb], configpath=configpath)
        resultpb.pkgs[0].config.smt_satisfied = satisfied
        write_proto_to_file(resultpb, filename=outfile, binary=False)
    # clean up residues
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
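# For illustration only: with a hypothetical input foo.apk, the command assembled above
# comes out roughly as
#   java -jar target/astgen-java-1.0.0-jar-with-dependencies.jar \
#       -inpath foo.apk -outfile foo.ast -config <configpath> \
#       -android_jar_dir platforms/ -intype APK -process_dir foo.apk \
#       [-package_name <name>] [-package_version <version>]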
def astgen(self, inpath, outfile, root=None, configpath=None,
           pkg_name=None, pkg_version=None, evaluate_smt=False):
    analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)
    # ./vendor/nikic/php-parser/bin/php-parse -d ../testdata/test-eval-exec.php
    configpb = AstLookupConfig()
    configpath_bin = configpath + '.bin'
    # create binary config from text format
    self._pb_text_to_bin(proto=configpb, infile=configpath, outfile=configpath_bin)
    astgen_cmd = ['php', 'astgen.php', '-c', configpath_bin, '-i', analyze_path, '-o', outfile]
    if root is not None:
        astgen_cmd.extend(['-b', root])
    if pkg_name is not None:
        astgen_cmd.extend(['-n', pkg_name])
    if pkg_version is not None:
        astgen_cmd.extend(['-v', pkg_version])
    exec_command("php astgen", astgen_cmd, cwd="static_proxy")
    # convert binary output to text format
    resultpb = PkgAstResults()
    read_proto_from_file(resultpb, filename=outfile, binary=True)
    # optionally evaluate smt formula
    if evaluate_smt:
        satisfied = self._check_smt(astgen_results=[resultpb], configpath=configpath)
        resultpb.pkgs[0].config.smt_satisfied = satisfied
    # save resultpb
    write_proto_to_file(resultpb, filename=outfile, binary=False)
    # clean up residues
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
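# For illustration only: the assembled command resembles
#   php astgen.php -c <configpath>.bin -i <analyze_path> -o <outfile> \
#       [-b <root>] [-n <pkg_name>] [-v <pkg_version>]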
def py3_astgen(inpath, outfile, configpb, root=None, pkg_name=None, pkg_version=None):
    # get input files
    infiles, root = get_infiles(inpath=inpath, root=root)
    # initialize resultpb
    resultpb = PkgAstResults()
    pkg = resultpb.pkgs.add()
    pkg.config.CopyFrom(configpb)
    pkg.pkg_name = pkg_name if pkg_name is not None else basename(inpath)
    if pkg_version is not None:
        pkg.pkg_version = pkg_version
    pkg.language = ast_pb2.PYTHON
    for infile in infiles:
        all_source = open(infile, 'r').read()
        try:
            tree = ast.parse(all_source, filename=infile)
        except SyntaxError as se:
            logging.warning("Syntax error %s parsing file %s in python3!", se, infile)
            raise se
        # mark the tree with tokens information
        asttok = asttokens.ASTTokens(source_text=all_source, tree=tree, filename=infile)
        visitor = PythonDeclRefVisitor(asttok=asttok, configpb=configpb)
        visitor.visit(tree)
        logging.warning("collected functions: %s", Counter(visitor.get_declrefs()).items())
        filepb = get_filepb(infile, root)
        for base, name, args, source_text, source_range in visitor.get_declrefs():
            api_result = get_api_result(base, name, args, source_text, source_range, filepb)
            pkg.api_results.add().CopyFrom(api_result)
    # save resultpb
    write_proto_to_file(resultpb, outfile, binary=False)
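# A minimal sketch of a command-line wrapper around py3_astgen, matching the
# 'python3 astgen_py3.py <inpath> <outfile> -c <config> [-b root] [-n name] [-v version]'
# invocation used by the python2 fallback path elsewhere in this codebase. The argparse
# wiring here is an assumption for illustration, not the repo's actual astgen_py3.py.
import argparse

def main():
    parser = argparse.ArgumentParser(description="python3 astgen")
    parser.add_argument('inpath')
    parser.add_argument('outfile')
    parser.add_argument('-c', '--config', required=True)
    parser.add_argument('-b', '--root')
    parser.add_argument('-n', '--pkg_name')
    parser.add_argument('-v', '--pkg_version')
    args = parser.parse_args()
    # load the text-format lookup config, then delegate to py3_astgen
    configpb = AstLookupConfig()
    read_proto_from_file(configpb, args.config, binary=False)
    py3_astgen(inpath=args.inpath, outfile=args.outfile, configpb=configpb,
               root=args.root, pkg_name=args.pkg_name, pkg_version=args.pkg_version)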
def taint(self, inpath, outfile, configpath=None, pkg_name=None, pkg_version=None):
    analyze_path, is_decompress_path, outfile, _, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=None, configpath=configpath, language=self.language)
    # convert the config to binary
    configpb = AstLookupConfig()
    configpath_bin = configpath + '.bin'
    # create binary config from text format
    self._pb_text_to_bin(proto=configpb, infile=configpath, outfile=configpath_bin)
    # perform static taint analysis
    taint_cmd = ['node', 'jsprime_wrapper.js', pkg_name, analyze_path, configpath_bin, outfile]
    exec_command("javascript taint", taint_cmd, cwd="static_proxy/jsprime")
    pkg_static = ModuleStatic()
    read_proto_from_file(pkg_static, outfile, binary=True)
    logging.warning("taint analysis results: %s", pkg_static)
    # save resultpb
    write_proto_to_file(pkg_static, filename=outfile, binary=False)
    # clean up residues
    os.remove(configpath_bin)
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
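# Hedged usage sketch: the analyzer instance, paths, and config name are hypothetical. The
# call runs the jsprime-based taint analysis on an npm tarball and leaves a text-format
# ModuleStatic proto at outfile.
analyzer.taint(inpath='/tmp/lodash-4.17.21.tgz', outfile='/tmp/lodash-4.17.21.taint',
               configpath='astgen_javascript_smt.config', pkg_name='lodash',
               pkg_version='4.17.21')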
def _gen_combined_configpath(self, configpath, dep_taint_results):
    # load the old config
    configpb = AstLookupConfig()
    read_proto_from_file(configpb, configpath, binary=False)
    # iterate through the taint results to update configpb
    num_new_sources = 0
    num_new_sinks = 0
    for dep_taint_result in dep_taint_results:
        # dep_taint_result is of type module_pb2.ModuleStatic
        for new_source in dep_taint_result.sources:
            configpb.apis.append(new_source.node)
            num_new_sources += 1
        for new_sink in dep_taint_result.sinks:
            configpb.apis.append(new_sink.node)
            num_new_sinks += 1
    if num_new_sources + num_new_sinks > 0:
        logging.warning("added %d new sources and %d new sinks!", num_new_sources, num_new_sinks)
    # generate the new config file
    outf = tempfile.NamedTemporaryFile(prefix='configpath-', delete=False)
    write_proto_to_file(proto=configpb, filename=outf.name, binary=False)
    return outf.name
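# Hedged usage sketch (analyzer instance and file names hypothetical): per-dependency taint
# outputs are text-format ModuleStatic protos, as written by taint() above, so they can be
# loaded and folded into a combined config.
dep_taint_results = []
for dep_file in ('dep_a.taint', 'dep_b.taint'):
    dep_static = ModuleStatic()
    read_proto_from_file(dep_static, dep_file, binary=False)
    dep_taint_results.append(dep_static)
combined_configpath = analyzer._gen_combined_configpath(configpath, dep_taint_results)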
def secure_extract_from_file_or_repo(tuple_input):
    """A wrapper around extract_from_file_or_repo, simply configures the extract configuration

    :param tuple_input: tuple of (inpath, app_digest_set, component_digest_set,
        new_component_digest_set), where inpath is the input path specified by command line,
        app_digest_set holds all the app digests, component_digest_set holds all the component
        digests, and new_component_digest_set holds the newly seen component digests
    """
    inpath, app_digest_set, component_digest_set, new_component_digest_set = tuple_input
    msg = "Processing %s" % inpath
    logging.info(msg)
    print(msg)
    extract_config = repo_pb.ExtractConfig()
    for extract_type in FLAGS.extract_types:
        extract_config.extract_types.append(getattr(repo_pb, extract_type))
    if FLAGS.dbfile:
        extract_config.db_path = FLAGS.dbfile
    file_digest = hashfile(open(inpath, 'rb'), hashlib.sha1()) if os.path.isfile(inpath) else None
    extract_config.inspect_compressed_files = True
    extract_config.in_path = inpath.encode('utf8') if isinstance(inpath, unicode) else inpath
    extract_config.in_digest = file_digest
    extract_config.store_type = FLAGS.store_type
    # sub_in_type is the input type for workers
    extract_config.in_type = getattr(repo_pb, FLAGS.sub_in_type)
    # outpath is generic for all files, and files are named with digests
    extract_config.out_path = FLAGS.outdir.encode('utf8') if isinstance(FLAGS.outdir, unicode) else FLAGS.outdir
    if extract_config.store_type == 'database':
        # Store using database.
        raise Exception("deprecated")
        # extract_config.summary_table_name = FLAGS.summary_table
        # extract_config.detail_table_name = FLAGS.detail_table
        # db_obj_for_summary = SQLiteDatabase(dbpath=FLAGS.dbfile)
        # processed = db_obj_for_summary.exists_table(table_name=FLAGS.summary_table,
        #     where_name_value_dict={'app_digest': file_digest, 'processed': 1})
        # if not processed:
        #     try:
        #         extract_from_file_or_repo(extract_config)
        #         db_obj_for_summary.update_table(table_name=FLAGS.summary_table,
        #             # TODO: sharedlib_count is non-trivial to get, skipping for now!
        #             set_name_value_dict={'processed': 1},
        #             where_name_value_dict={'app_digest': file_digest})
        #     except Exception as e:
        #         db_obj_for_summary.update_table(table_name=FLAGS.summary_table,
        #             # 0, non-processed, 1, processed, -1, error processing
        #             set_name_value_dict={'processed': -1},
        #             where_name_value_dict={'app_digest': file_digest})
        #         msg = "Error processing %s: %s" % (inpath, e)
        #         logging.error(msg)
        #         print(msg)
    elif extract_config.store_type == 'file':
        if file_digest not in app_digest_set:
            # not processed
            extract_from_file_or_repo(extract_config=extract_config,
                                      component_digest_set=component_digest_set,
                                      new_component_digest_set=new_component_digest_set,
                                      summarize_size=FLAGS.summarize_size)
            app_digest_set[file_digest] = True
            if len(extract_config.components) > 0:
                # Something was extracted
                write_proto_to_file(
                    proto=extract_config,
                    filename=os.path.join(extract_config.out_path,
                                          basename(extract_config.in_path) + COMPONENTS_SUFFIX),
                    binary=False)
                logging.debug("%s write analyzed components to file successful, extract_config:\n%s",
                              extract_config.in_path, extract_config)
            else:
                logging.info("No extracted components: %s", extract_config)
        else:
            logging.info("Skipping processed item: %s", inpath)
    else:
        # file_with_symlink is not used here!
        raise Exception("Unhandled store type")
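# A hedged sketch of how this worker might be driven (the pool wiring is an assumption, not
# shown in this file): the tuple_input signature is the classic shape for multiprocessing
# map. Note that plain dicts are not shared across processes; a real driver would need
# multiprocessing.Manager()-backed containers or a thread pool.
from multiprocessing import Pool
inputs = [(inpath, app_digest_set, component_digest_set, new_component_digest_set)
          for inpath in inpaths]
Pool(processes=4).map(secure_extract_from_file_or_repo, inputs)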
def astfilter(self, pkg_name, outdir, cache_dir=None, configpath=None, pkg_version=None,
              pkg_manager=None, ignore_dep_version=False, ignore_dep=False):
    """
    Filter packages and their dependencies based on sensitive APIs and their combinations.
    This helps narrow down packages for further analysis.
    """
    # sanitize language
    if self.language is None:
        raise Exception("Invoking astfilter on invalid language: %s" % self.language)
    if pkg_manager is None:
        pm_proxy = get_pm_proxy_for_language(language=self.language, cache_dir=cache_dir,
                                             isolate_pkg_info=True)
    else:
        pm_proxy = get_pm_proxy(pm=pkg_manager, cache_dir=cache_dir, isolate_pkg_info=True)
    # check for cached astfilter file
    astfilter_fname = pm_proxy.get_astfilter_fname(pkg_name=pkg_name, pkg_version=pkg_version)
    astfilter_file = join(outdir, astfilter_fname)
    if exists(astfilter_file):
        logging.warning("skipping cached astfilter_file %s!", astfilter_file)
        return
    # get the astgen results for the main package as well as its dependent packages
    astgen_results = []
    main_astgen_result = self.get_astgen_result(pm_proxy=pm_proxy, pkg_name=pkg_name,
                                                outdir=outdir, configpath=configpath,
                                                pkg_version=pkg_version)
    if main_astgen_result:
        astgen_results.append(main_astgen_result)
    else:
        logging.error("failed to run astfilter on pkg %s ver %s", pkg_name, pkg_version)
        return
    # get flattened dependencies and their astgen results
    if not ignore_dep:
        try:
            flatten_dep_pkgs = pm_proxy.get_dep(pkg_name=pkg_name, pkg_version=pkg_version,
                                                flatten=True)
        except Exception as gde:
            logging.error("failed to get_dep on pkg %s ver %s: %s", pkg_name, pkg_version, gde)
            return
        for dep_name, dep_version in flatten_dep_pkgs.items():
            if ignore_dep_version:
                dep_version = None
            dep_astgen_result = self.get_astgen_result(pm_proxy=pm_proxy, pkg_name=dep_name,
                                                       outdir=outdir, configpath=configpath,
                                                       pkg_version=dep_version)
            if dep_astgen_result:
                astgen_results.append(dep_astgen_result)
    # check satisfiability of the specified smt formula and dump the corresponding output
    satisfied = StaticAnalyzer._check_smt(astgen_results=astgen_results, configpath=configpath)
    main_astgen_result.pkgs[0].config.smt_satisfied = satisfied
    # TODO: maybe record the suspicious API usage in each dependent package as well
    # dump the astfilter result to file
    write_proto_to_file(proto=main_astgen_result, filename=astfilter_file, binary=False)
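# Hedged usage sketch: the analyzer instance, package choice, and config name are
# hypothetical. astfilter caches its verdict as a text-format proto named by the package
# manager proxy, so repeated runs are skipped.
analyzer.astfilter(pkg_name='event-stream', pkg_version='3.3.6', pkg_manager='npm',
                   outdir='/tmp/astfilter', configpath='astgen_javascript_smt.config')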
def _pb_text_to_bin(proto, infile, outfile):
    read_proto_from_file(proto=proto, filename=infile, binary=False)
    write_proto_to_file(proto=proto, filename=outfile, binary=True)
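# Minimal usage sketch (paths hypothetical): tools such as the php and javascript helpers
# above consume the compact binary encoding, so the text-format config is converted once
# up front.
configpb = AstLookupConfig()
_pb_text_to_bin(proto=configpb, infile='astgen.config', outfile='astgen.config.bin')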
def astgen(self, inpath, outfile, root=None, configpath=None,
           pkg_name=None, pkg_version=None, evaluate_smt=False):
    analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)
    # try python2
    try:
        # load the config proto
        configpb = AstLookupConfig()
        read_proto_from_file(configpb, configpath, binary=False)
        logging.debug("loaded lookup config from %s:\n%s", configpath, configpb)
        # invoke the language specific ast generators to call functions
        # get input files
        infiles, root = self._get_infiles(inpath=analyze_path, root=root, language=self.language)
        # initialize resultpb
        resultpb = PkgAstResults()
        pkg = resultpb.pkgs.add()
        pkg.config.CopyFrom(configpb)
        pkg.pkg_name = pkg_name if pkg_name is not None else basename(analyze_path)
        if pkg_version is not None:
            pkg.pkg_version = pkg_version
        pkg.language = ast_pb2.PYTHON
        for infile in infiles:
            all_source = open(infile, 'r').read()
            try:
                tree = ast.parse(all_source, filename=infile)
            except SyntaxError as se:
                logging.warning("Syntax error %s parsing file %s in python2!", se, infile)
                raise se
            # mark the tree with tokens information
            asttok = asttokens.ASTTokens(source_text=all_source, tree=tree, filename=infile)
            visitor = PythonDeclRefVisitor(asttok=asttok, configpb=configpb)
            visitor.visit(tree)
            logging.warning("collected functions: %s", Counter(visitor.get_declrefs()).items())
            filepb = self._get_filepb(infile, root)
            for base, name, args, source_text, source_range in visitor.get_declrefs():
                api_result = self._get_api_result(base, name, args, source_text, source_range,
                                                  filepb)
                pkg.api_results.add().CopyFrom(api_result)
        # save resultpb
        write_proto_to_file(resultpb, outfile, binary=False)
    # try python3
    except SyntaxError as se:
        logging.error("Syntax error %s, now trying to parse %s again in python3!", se, analyze_path)
        astgen_py3_cmd = ['python3', 'astgen_py3.py', analyze_path, outfile, '-c', configpath]
        if root is not None:
            astgen_py3_cmd.extend(['-b', root])
        if pkg_name is not None:
            astgen_py3_cmd.extend(['-n', pkg_name])
        if pkg_version is not None:
            astgen_py3_cmd.extend(['-v', pkg_version])
        exec_command("python3 astgen", astgen_py3_cmd, cwd="static_proxy")
    except Exception as e:
        logging.error("Fatal error %s running astgen for %s!", e, analyze_path)
    # optionally evaluate smt formula
    if evaluate_smt:
        resultpb = PkgAstResults()
        read_proto_from_file(resultpb, filename=outfile, binary=False)
        satisfied = self._check_smt(astgen_results=[resultpb], configpath=configpath)
        resultpb.pkgs[0].config.smt_satisfied = satisfied
        write_proto_to_file(resultpb, filename=outfile, binary=False)
    # clean up residues
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
def reformat(apis_file, json_result_file, outfile):
    try:
        results = json.load(open(json_result_file, 'r'))
    except Exception as e:
        logging.error("failed to load pyt results in json: %s", json_result_file)
        return None
    # load the astgen config from file
    config = AstLookupConfig()
    read_proto_from_file(config, apis_file, binary=False)
    logging.warning("loaded config with %d apis to check!", len(config.apis))
    # convert list of apis into dictionary with key=id, value=full_name for easier identification
    source_dict = {}
    sink_dict = {}
    for entry in config.apis:
        # FIXME: should we support func_only mode?
        if entry.functionality == ast_pb2.SOURCE:
            source_dict[entry.id] = entry.full_name
        elif entry.functionality in (ast_pb2.SINK, ast_pb2.DANGER):
            sink_dict[entry.id] = entry.full_name
    nodes = []
    # dictionary with key=name of file within package found to contain vulnerabilities and
    # value=tuple of (tree, asttok) for that file
    vuln_files_ASTs = {}
    for entry in results['vulnerabilities']:
        source = entry['source']  # source['label'], source['line_number'], source['path']
        source_trigger_word = entry['source_trigger_word']
        sink = entry['sink']  # sink['label'], sink['line_number'], sink['path']
        sink_trigger_word = entry['sink_trigger_word']
        api_type = entry['type']
        reassignment_nodes = entry['reassignment_nodes']  # of type dict
        vuln_files_ASTs[source['path']] = ()
        vuln_files_ASTs[sink['path']] = ()
        nodes.append(Vulnerability(source, source_trigger_word, sink, sink_trigger_word,
                                   api_type, reassignment_nodes))
    # initiate AST visitors (one tree per vulnerable file within package)
    for file in vuln_files_ASTs:
        src_ast = open(file, 'r').read()
        tree = ast.parse(src_ast, filename=file)
        asttok = asttokens.ASTTokens(source_text=src_ast, tree=tree, filename=file)
        # visitor = PythonVisitor(asttok=asttok)
        visit_info = (tree, asttok)
        vuln_files_ASTs[file] = visit_info
    # initialize result and summary
    result = ModuleResult()
    set_result(result, config.apis, source_dict, sink_dict, nodes, vuln_files_ASTs)
    summary = ModuleSummary()
    set_summary(summary, config.apis, source_dict, sink_dict, nodes, vuln_files_ASTs)
    static = ModuleStatic()
    static.flows.MergeFrom(result.flows)
    static.dangers.MergeFrom(result.dangers)
    static.sources.MergeFrom(summary.sources)
    static.sinks.MergeFrom(summary.sinks)
    static.taint_wrappers.MergeFrom(summary.taint_wrappers)
    write_proto_to_file(proto=static, filename=outfile, binary=False)
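# For reference, a hedged reconstruction of the pyt JSON shape consumed above, based only on
# the fields accessed; the values are illustrative:
# {
#   "vulnerabilities": [{
#     "source": {"label": "...", "line_number": 3, "path": "pkg/views.py"},
#     "source_trigger_word": "...",
#     "sink": {"label": "...", "line_number": 9, "path": "pkg/views.py"},
#     "sink_trigger_word": "...",
#     "type": "...",
#     "reassignment_nodes": ...
#   }]
# }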
def astgen(self, inpath, outfile, root=None, configpath=None,
           pkg_name=None, pkg_version=None, evaluate_smt=False):
    """
    There are two ways to implement the javascript ast parsing, each with its own pros and
    cons. One is to directly use the npm esprima module; the other is to use the pypi
    esprima module.

    1. The npm module is the latest version and has lots of features to use directly, but
       it doesn't provide a visitor and requires manual implementation.
    2. The pypi module claims to be a line-by-line translation of esprima into python, but
       it may be outdated and inactively maintained. However, it contains a visitor similar
       to python ast.NodeVisitor that we can use directly.

    To minimize effort, I currently choose the latter.
    """
    analyze_path, is_decompress_path, outfile, root, configpath = self._sanitize_astgen_args(
        inpath=inpath, outfile=outfile, root=root, configpath=configpath, language=self.language)
    # load the config proto
    configpb = AstLookupConfig()
    read_proto_from_file(configpb, configpath, binary=False)
    logging.debug("loaded lookup config from %s:\n%s", configpath, configpb)
    # invoke the language specific ast generators to call functions
    # FIXME: current testdata sometimes fails the analyzer, inspect it!
    # get input files
    infiles, root = self._get_infiles(inpath=analyze_path, root=root, language=self.language)
    # initialize resultpb
    resultpb = PkgAstResults()
    pkg = resultpb.pkgs.add()
    pkg.config.CopyFrom(configpb)
    pkg.pkg_name = pkg_name if pkg_name is not None else basename(analyze_path)
    if pkg_version is not None:
        pkg.pkg_version = pkg_version
    pkg.language = ast_pb2.JAVASCRIPT
    for infile in infiles:
        all_source = open(infile, 'r').read()
        try:
            # tree = esprima.parseModule(), esprima.parseScript()
            tree = esprima.parse(all_source, options={'loc': True})
        except Exception as e:
            logging.error("Fatal error %s parsing file %s! Skipping this file!", e, infile)
            continue
        visitor = JavaScriptDeclRefVisitor(source=all_source, configpb=configpb)
        visitor.visit(tree)
        logging.warning("collected functions: %s", Counter(visitor.get_declrefs()).items())
        filepb = self._get_filepb(infile, root)
        for base, name, args, source_text, source_range in visitor.get_declrefs():
            api_result = self._get_api_result(base, name, args, source_text, source_range, filepb)
            pkg.api_results.add().CopyFrom(api_result)
    # optionally evaluate smt formula
    if evaluate_smt:
        satisfied = self._check_smt(astgen_results=[resultpb], configpath=configpath)
        resultpb.pkgs[0].config.smt_satisfied = satisfied
    # save resultpb
    write_proto_to_file(resultpb, outfile, binary=False)
    # clean up residues
    self._cleanup_astgen(analyze_path=analyze_path, is_decompress_path=is_decompress_path)
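# A quick illustration of the pypi esprima parsing used above (assumes `pip install esprima`;
# the snippet content is illustrative). esprima.parse returns a node tree whose nodes carry
# .type and, with the {'loc': True} option, source line/column ranges under .loc:
import esprima

tree = esprima.parse("require('fs').readFile('/etc/passwd');", options={'loc': True})
expr = tree.body[0].expression           # the CallExpression under the ExpressionStatement
print(expr.type, expr.loc.start.line)    # -> CallExpression 1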