import logging
import itertools
import collections
from typing import Any, Dict, Tuple

import tqdm

import capa.engine
import capa.features.common

# NOTE: these import paths assume capa's module layout (RuleSet/Scope in capa.rules,
# FeatureSet/MatchResults in capa.engine, handle types in the base extractor);
# they may differ slightly across capa versions.
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor

logger = logging.getLogger(__name__)


def find_instruction_capabilities(
    ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle, bb: BBHandle, insn: InsnHandle
) -> Tuple[FeatureSet, MatchResults]:
    """
    find matches for the given rules for the given instruction.

    returns: tuple containing (features for instruction, match results for instruction)
    """
    # all features found for the instruction.
    features = collections.defaultdict(set)  # type: FeatureSet

    for feature, va in itertools.chain(
        extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()
    ):
        features[feature].add(va)

    # matches found at this instruction.
    _, matches = ruleset.match(Scope.INSTRUCTION, features, int(insn))
    for rule_name, res in matches.items():
        rule = ruleset[rule_name]
        for va, _ in res:
            capa.engine.index_rule_matches(features, rule, [va])

    return features, matches

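# The functions in this module all traffic in two shapes from capa.engine:
# FeatureSet maps a feature to the set of addresses where it was observed, and
# MatchResults maps a rule name to a list of (address, result) pairs.
# A rough, illustrative sketch of those shapes; the concrete feature classes and
# rule name below are examples chosen for illustration, not taken from this module:
#
#   features = {
#       capa.features.insn.API("kernel32.CreateFileA"): {0x401000},
#       capa.features.common.String("config.dat"): {0x401010, 0x401234},
#   }  # type: FeatureSet
#
#   matches = {
#       "create or open file": [(0x401000, result)],  # result: capa.engine.Result
#   }  # type: MatchResults
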
def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet):
    """
    find matches for the given rules at the file scope.

    returns: tuple containing (match results for file, number of file features)
    """
    file_features = collections.defaultdict(set)  # type: FeatureSet

    for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()):
        # not all file features may have virtual addresses.
        # if not, then at least ensure the feature shows up in the index.
        # the set of addresses will still be empty.
        if va:
            file_features[feature].add(va)
        else:
            if feature not in file_features:
                file_features[feature] = set()

    logger.debug("analyzed file and extracted %d features", len(file_features))

    file_features.update(function_features)

    _, matches = ruleset.match(Scope.FILE, file_features, 0x0)
    return matches, len(file_features)

def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle):
    """
    find matches for the given rules within the given function.

    returns: tuple containing (match results for function, match results for basic blocks, number of features)
    """
    # contains features from:
    #  - insns
    #  - function
    function_features = collections.defaultdict(set)  # type: FeatureSet
    bb_matches = collections.defaultdict(list)  # type: MatchResults

    for feature, va in extractor.extract_function_features(f):
        function_features[feature].add(va)

    for bb in extractor.get_basic_blocks(f):
        # contains features from:
        #  - insns
        #  - basic blocks
        bb_features = collections.defaultdict(set)

        for feature, va in extractor.extract_basic_block_features(f, bb):
            bb_features[feature].add(va)
            function_features[feature].add(va)

        for insn in extractor.get_instructions(f, bb):
            for feature, va in extractor.extract_insn_features(f, bb, insn):
                bb_features[feature].add(va)
                function_features[feature].add(va)

        _, matches = capa.engine.match(ruleset.basic_block_rules, bb_features, int(bb))
        for rule_name, res in matches.items():
            bb_matches[rule_name].extend(res)
            for va, _ in res:
                function_features[capa.features.common.MatchedRule(rule_name)].add(va)

    _, function_matches = capa.engine.match(ruleset.function_rules, function_features, int(f))
    return function_matches, bb_matches, len(function_features)

def find_basic_block_capabilities(
    ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle, bb: BBHandle
) -> Tuple[FeatureSet, MatchResults, MatchResults]:
    """
    find matches for the given rules within the given basic block.

    returns: tuple containing (features for basic block, match results for basic block, match results for instructions)
    """
    # all features found within this basic block,
    # includes features found within instructions.
    features = collections.defaultdict(set)  # type: FeatureSet

    # matches found at the instruction scope.
    # might be found at different instructions, that's ok.
    insn_matches = collections.defaultdict(list)  # type: MatchResults

    for insn in extractor.get_instructions(f, bb):
        ifeatures, imatches = find_instruction_capabilities(ruleset, extractor, f, bb, insn)
        for feature, vas in ifeatures.items():
            features[feature].update(vas)

        for rule_name, res in imatches.items():
            insn_matches[rule_name].extend(res)

    for feature, va in itertools.chain(
        extractor.extract_basic_block_features(f, bb), extractor.extract_global_features()
    ):
        features[feature].add(va)

    # matches found within this basic block.
    _, matches = ruleset.match(Scope.BASIC_BLOCK, features, int(bb))
    for rule_name, res in matches.items():
        rule = ruleset[rule_name]
        for va, _ in res:
            capa.engine.index_rule_matches(features, rule, [va])

    return features, matches, insn_matches

def find_code_capabilities(
    ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle
) -> Tuple[MatchResults, MatchResults, MatchResults, int]:
    """
    find matches for the given rules within the given function.

    returns: tuple containing (match results for function, match results for basic blocks,
             match results for instructions, number of features)
    """
    # all features found within this function,
    # includes features found within basic blocks (and instructions).
    function_features = collections.defaultdict(set)  # type: FeatureSet

    # matches found at the basic block scope.
    # might be found at different basic blocks, that's ok.
    bb_matches = collections.defaultdict(list)  # type: MatchResults

    # matches found at the instruction scope.
    # might be found at different instructions, that's ok.
    insn_matches = collections.defaultdict(list)  # type: MatchResults

    for bb in extractor.get_basic_blocks(f):
        features, bmatches, imatches = find_basic_block_capabilities(ruleset, extractor, f, bb)
        for feature, vas in features.items():
            function_features[feature].update(vas)

        for rule_name, res in bmatches.items():
            bb_matches[rule_name].extend(res)

        for rule_name, res in imatches.items():
            insn_matches[rule_name].extend(res)

    for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()):
        function_features[feature].add(va)

    _, function_matches = ruleset.match(Scope.FUNCTION, function_features, int(f))
    return function_matches, bb_matches, insn_matches, len(function_features)

def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle):
    """
    find matches for the given rules within the given function.

    returns: tuple containing (match results for function, match results for basic blocks, number of features)
    """
    # contains features from:
    #  - insns
    #  - function
    function_features = collections.defaultdict(set)  # type: FeatureSet
    bb_matches = collections.defaultdict(list)  # type: MatchResults

    for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()):
        function_features[feature].add(va)

    for bb in extractor.get_basic_blocks(f):
        # contains features from:
        #  - insns
        #  - basic blocks
        bb_features = collections.defaultdict(set)

        for feature, va in itertools.chain(
            extractor.extract_basic_block_features(f, bb), extractor.extract_global_features()
        ):
            bb_features[feature].add(va)
            function_features[feature].add(va)

        for insn in extractor.get_instructions(f, bb):
            for feature, va in itertools.chain(
                extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()
            ):
                bb_features[feature].add(va)
                function_features[feature].add(va)

        _, matches = ruleset.match(Scope.BASIC_BLOCK, bb_features, int(bb))
        for rule_name, res in matches.items():
            bb_matches[rule_name].extend(res)
            rule = ruleset[rule_name]
            for va, _ in res:
                capa.engine.index_rule_matches(function_features, rule, [va])

    _, function_matches = ruleset.match(Scope.FUNCTION, function_features, int(f))
    return function_matches, bb_matches, len(function_features)

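# For reference: capa.engine.index_rule_matches is what lets a basic block match
# become visible to function-scope rules (and, above, an instruction match to
# basic-block-scope rules) via the `match: <rule name>` feature. A simplified
# sketch of the idea, under the assumption that it indexes the rule name and its
# namespace prefixes; see capa.engine for the actual implementation:
#
#   def index_rule_matches(features, rule, locations):
#       features[capa.features.common.MatchedRule(rule.name)].update(locations)
#       namespace = rule.meta.get("namespace")
#       while namespace:
#           features[capa.features.common.MatchedRule(namespace)].update(locations)
#           namespace, _, _ = namespace.rpartition("/")
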
def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None) -> Tuple[MatchResults, Any]:
    """
    find matches for the given rules across all functions and the file scope.

    returns: tuple containing (all match results, metadata about the analysis)
    """
    all_function_matches = collections.defaultdict(list)  # type: MatchResults
    all_bb_matches = collections.defaultdict(list)  # type: MatchResults

    meta = {
        "feature_counts": {
            "file": 0,
            "functions": {},
        },
        "library_functions": {},
    }  # type: Dict[str, Any]

    pbar = tqdm.tqdm
    if disable_progress:
        # do not use tqdm to avoid unnecessary side effects when caller intends
        # to disable progress completely
        pbar = lambda s, *args, **kwargs: s

    functions = list(extractor.get_functions())
    n_funcs = len(functions)

    pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions")
    for f in pb:
        function_address = int(f)

        if extractor.is_library_function(function_address):
            function_name = extractor.get_function_name(function_address)
            logger.debug("skipping library function 0x%x (%s)", function_address, function_name)
            meta["library_functions"][function_address] = function_name
            n_libs = len(meta["library_functions"])
            percentage = 100 * (n_libs / n_funcs)
            if isinstance(pb, tqdm.tqdm):
                pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage))
            continue

        function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f)
        meta["feature_counts"]["functions"][function_address] = feature_count
        logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count)

        for rule_name, res in function_matches.items():
            all_function_matches[rule_name].extend(res)
        for rule_name, res in bb_matches.items():
            all_bb_matches[rule_name].extend(res)

    # collection of features that captures the rule matches within function and BB scopes.
    # mapping from feature (matched rule) to set of addresses at which it matched.
    function_and_lower_features = {
        capa.features.common.MatchedRule(rule_name): set(map(lambda p: p[0], results))
        for rule_name, results in itertools.chain(all_function_matches.items(), all_bb_matches.items())
    }  # type: FeatureSet

    all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features)
    meta["feature_counts"]["file"] = feature_count

    matches = {
        rule_name: results
        for rule_name, results in itertools.chain(
            # each rule exists in exactly one scope,
            # so there won't be any overlap among these following MatchResults,
            # and we can merge the dictionaries naively.
            all_bb_matches.items(),
            all_function_matches.items(),
            all_file_matches.items(),
        )
    }

    return matches, meta
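
# A minimal usage sketch of the driver above: given an already-constructed RuleSet
# and FeatureExtractor (how those are built is backend-specific and not shown here),
# call find_capabilities and walk the results. `report_capabilities` is a
# hypothetical helper written for illustration; only the find_capabilities call and
# the shape of its return value come from the code above.
def report_capabilities(ruleset: RuleSet, extractor: FeatureExtractor) -> None:
    matches, meta = find_capabilities(ruleset, extractor, disable_progress=True)

    for rule_name, results in sorted(matches.items()):
        # each entry is a list of (address, result) pairs, one per match location.
        locations = sorted(va for va, _ in results)
        print("%-60s %s" % (rule_name, ", ".join(hex(va) for va in locations)))

    print("file features: %d" % meta["feature_counts"]["file"])
    print("library functions skipped: %d" % len(meta["library_functions"]))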