def sample_trim(self):
    """
    Draw a raw grammar with sample_raw() and return its trimmed version.

    The returned grammar keeps the raw grammar's start symbol, takes the
    trim set as its nonterminals and the usable productions as its
    productions; its terminals are the right-hand sides of the usable
    length-2 (lexical) productions.

    Raises ValueError("Empty language") if the trim set is empty.
    """
    raw = self.sample_raw()
    logging.info(
        "CFG nominally has %d nonterminals, %d terminals, %d binary_rules and %d lexical rules",
        self.number_nonterminals,
        self.number_terminals,
        self.binary_rules,
        self.lexical_rules)

    trim_set = raw.compute_trim_set()
    if len(trim_set) == 0:
        # empty language
        raise ValueError("Empty language")

    usable = raw.compute_usable_productions(trim_set)
    # Only the terminals that still occur in a surviving lexical rule are kept.
    surviving_terminals = {rule[1] for rule in usable if len(rule) == 2}

    trimmed = cfg.CFG()
    trimmed.start = raw.start
    trimmed.terminals = surviving_terminals
    trimmed.nonterminals = trim_set
    trimmed.productions = set(usable)

    binary_count = sum(1 for rule in trimmed.productions if len(rule) == 3)
    lexical_count = sum(1 for rule in trimmed.productions if len(rule) == 2)
    logging.info(
        "Final CFG has %d nonterminals, %d terminals, %d binary_rules and %d lexical rules",
        len(trimmed.nonterminals),
        len(trimmed.terminals),
        binary_count,
        lexical_count)
    return trimmed
def sample_raw(self):
    """
    Sample a random grammar and return it as a cfg.CFG.

    Lexical rules (nonterminal, word) and binary rules (a, b, c) are drawn
    with numpy.random.choice until self.lexical_rules and self.binary_rules
    distinct rules have been collected.  When self.strict_cnf is set, the
    two children of a binary rule are drawn from nonterminals[1:], i.e. the
    first nonterminal (the start symbol) never appears on a right-hand side.
    """
    # DEBUGGING: the lexicon is sorted and its extremes are printed.
    lexicon = sorted(utility.generate_lexicon(self.number_terminals))
    print(lexicon[0], lexicon[-1])

    nonterminals = self.generate_nonterminals()
    lexicon_size = len(lexicon)

    lexical = set()
    while len(lexical) < self.lexical_rules:
        parent = numpy.random.choice(nonterminals)
        word_index = numpy.random.choice(range(lexicon_size))
        word = lexicon[word_index]
        lexical.add((parent, word))
        print(parent, word)

    binary = set()
    while len(binary) < self.binary_rules:
        if self.strict_cnf:
            parent = numpy.random.choice(nonterminals)
            left, right = numpy.random.choice(nonterminals[1:], size=2)
        else:
            parent, left, right = numpy.random.choice(nonterminals, size=3)
        binary.add((parent, left, right))
        print(parent, left, right)

    grammar = cfg.CFG()
    grammar.start = nonterminals[0]
    grammar.nonterminals = set(nonterminals)
    grammar.terminals = set(lexicon)
    grammar.productions = lexical.union(binary)
    return grammar
def constructCFGs(insns, call_sites=None):
    """Build one overall CFG from *insns* and return its partitions.

    Args:
        insns: instructions handed to ``cfg.CFG.construct_new``.
        call_sites: optional call-site collection handed to
            ``construct_new``.  Defaults to a fresh empty list on every
            call (a shared ``[]`` default would leak state between calls
            if ``construct_new`` ever mutates it).

    Returns:
        The result of ``overall_cfg.partition()``.  Every basic block of
        every partition has its ``cfg`` attribute set to the partition it
        belongs to and its ``iterations`` attribute set to 1.
    """
    if call_sites is None:
        call_sites = []

    overall_cfg = cfg.CFG()
    overall_cfg.construct_new(insns, call_sites)
    cfgs = overall_cfg.partition()

    # Give each basic block a back-reference to its owning partition.
    for c in cfgs:
        for bb in c.basicblocks:
            bb.cfg = c
            bb.iterations = 1
    return cfgs
def sample_full(self):
    """
    Return the grammar containing every possible production.

    That is every lexical rule (nonterminal, word) and every binary rule
    (a, b, c) where b and c range over nonterminals[1:], so the first
    nonterminal (the start symbol) never occurs on a right-hand side.
    """
    lexicon = list(utility.generate_lexicon(self.number_terminals))
    nonterminals = self.generate_nonterminals()
    non_start = nonterminals[1:]

    lexical = {(nt, word) for nt in nonterminals for word in lexicon}
    binary = {
        (parent, left, right)
        for parent in nonterminals
        for left in non_start
        for right in non_start
    }

    grammar = cfg.CFG()
    grammar.start = nonterminals[0]
    grammar.nonterminals = set(nonterminals)
    grammar.terminals = set(lexicon)
    grammar.productions = lexical.union(binary)
    return grammar
def sample_uniform(self, lp=0.5, bp=0.5):
    """
    Sample each candidate production independently (Bernoulli).

    Every lexical rule (nonterminal, word) is kept with probability lp and
    every binary rule (a, b, c), with b and c taken from nonterminals[1:],
    is kept with probability bp.  Both probabilities default to 0.5.
    The productions are stored as a list, lexical rules first.
    """
    lexicon = list(utility.generate_lexicon(self.number_terminals))
    nonterminals = self.generate_nonterminals()
    non_start = nonterminals[1:]

    # One random draw per candidate, in the same nesting order for both kinds.
    kept = [
        (nt, word)
        for nt in nonterminals
        for word in lexicon
        if numpy.random.random() < lp
    ]
    kept.extend(
        (parent, left, right)
        for parent in nonterminals
        for left in non_start
        for right in non_start
        if numpy.random.random() < bp
    )

    grammar = cfg.CFG()
    grammar.start = nonterminals[0]
    grammar.nonterminals = set(nonterminals)
    grammar.terminals = set(lexicon)
    grammar.productions = kept
    return grammar
## ##g = CFG() ##g.nonterminals = ["S", "A"] ##g.alphabet = {"f":1, "a":0} ##g.productions = [["S", "f", ["A"]], ["A", "a", []]] ##g.costs = {"f":(lambda x: x[0] + 1), "a":(lambda x: 10)} ##(mu, minprods, order) = g.Knijkstra() ##assert mu["S"] == 11 ## ##trees = g.getTreesFromProds(minprods, order) ###print trees ###enums = g.EnumerateStrings(3) ## #### ###S should ignore A g2 = cfg.CFG() g2.nonterminals = ["S", "A"] g2.alphabet = {"f": 1, "a": 0} g2.productions = [["S", "f", ["A"]], ["A", "a", []], ["S", "a", []]] g2.costs = {"f": (lambda x: x[0] + 1), "a": (lambda x: 10)} assert g2.Knijkstra()[0]["S"] == 10 #should equal 10 ## ##g3 = CFG() ##g3.nonterminals = ["S", "A", "B"] ##g3.alphabet = {"f":1, "a":0, "b":0} ##g3.productions = [["S", "f", ["A"]], ["A", "a", []], ["A", "f", ["B"]], ["B", "b", []]] ##g3.costs = {"f":(lambda x: x[0] + 1), "a":(lambda x: 10), "b":(lambda x: 1)} ##assert g3.Knijkstra()[0]["S"]==3 #should be 3 ## ##
# Used for draw figures by seaborn # Ji Hongchen # 20200911 # ================================== import seaborn as sns import matplotlib.pyplot as plt import numpy as np import os import pandas as pd import satamethod import shutil import cfg from scipy import stats CFG = cfg.CFG() N_CLUSTER = 7 CLASS_LIST = [] for i in range(N_CLUSTER): CLASS_LIST.append('Class_' + str(i + 1)) PATH = '/Users/freud/Documents/MANU/lstmsom_data/exp20200617/analysis_patient_20200617/' CLAC_DICT = {'patient_age': 'avr', 'patient_weight': 'avr', 'patient_gender': 'chi', 'ajcc_stage': 'chi', 't_stage': 'chi', 'n_stage': 'chi', 'm_stage': 'chi'} OUTPATH = '/Users/freud/Documents/MANU/lstmsom_data/exp20200617/cluster/' def data_loader(dataframe, col_item): dataframe.drop(dataframe[dataframe[col_item].isin( CFG.clicfeat_dict[col_item]['delete'])].index, inplace=True) if CLAC_DICT[col_item] == 'chi': for i in CFG.clicfeat_dict[col_item]:
## ["S", "f", ["B", "A"]], ## #["A", "f", ["B", "A"]], ## #["S", "g", ["S"]], ## #["S", "f", ["S", "S"]], ## ["A", "g", ["B"]], ## ["B", "g", ["A"]]] ##tg5.alphabet = {p[1]:len(p[2]) for p in tg2.productions} ##tg5.costs = {'f':(lambda x: max(x)), 'g':(lambda x: x[0] + 1), 'a':(lambda x:1)} ###test5 = tg5.EnumerateStrings(5) ##tg5.checkCostFrequency(test5['S'], {2:2, 3:2, 4:1}) ## #### Test Grammar 4: Nand ###### ##My grammar doesn't allow for productions without functions, #Replaced vars and constants non-terminals with a production for each var & const nand = cfg.CFG() nand.root = "Start" nand.nonterminals = ["Start", "StartAnd"] nand.productions = [["Start", "a", []], ["Start", "b", []], ["Start", "c", []], ["Start", "d", []], ["Start", "true", []], ["Start", "false", []], ["Start", "not", ["StartAnd"]], ["StartAnd", "and", ["Start", "Start"]]] nand.alphabet = {p[1]: len(p[2]) for p in nand.productions} nand.costs = {a: (lambda x: 1 + sum(x)) for a in nand.alphabet} #nand.EnumerateStrings(10) #####Test Grammar 5: ITE ############ iteg = cfg.CFG() iteg.root = "Start" iteg.nonterminals = ["Start", "BoolExpr"] iteg.productions = [["Start", "0", []], ["Start", "1", []], ["Start", "2", []],
def parseString(filestring):
    """Build a cfg.CFG from the grammar of the first ``synth-fun`` entry.

    *filestring* is parsed with ``parse_parentheses`` and cleaned with
    ``removeExcess``; the first top-level list whose head is "synth-fun" is
    then used.  Its fifth element (index 4) is read as a list of
    nonterminal definitions of the form ``[name, type, productions]``.

    Each production becomes ``[lhs, symbol, children]`` in
    ``synthG.productions`` and ``synthG.alphabet[symbol]`` records the arity:

    * ``[value, type]`` with ``type`` equal to the nonterminal's type, or a
      bare string that is not a nonterminal -> arity-0 production;
    * a bare string naming a nonterminal B (``A -> B``) -> a copy of B's
      productions is appended to A's production list and processed later
      in the same loop;
    * ``[f, arg, ...]`` -> arity ``len(args)`` production.  A nested list
      argument, e.g. ``S -> g(g(A))``, is replaced by a fresh nonterminal
      whose only production is the nested expression.

    Returns:
        The populated grammar, or None when the synth-fun entry has fewer
        than 5 elements (no grammar specified).

    Raises:
        AssertionError: no synth-fun entry was found, a symbol is used with
            two different arities, or a production has an unsupported shape.
    """
    parsed_file = parse_parentheses(filestring)
    parsed_file = removeExcess(parsed_file)

    # Find the first synth-fun; define-fun entries seen before it are only
    # recorded by position.
    defined_funcs = []
    synth_pos = -1
    for pos, entry in enumerate(parsed_file):
        if isinstance(entry, list):
            if entry[0] == "synth-fun":
                synth_pos = pos
                break
            if entry[0] == "define-fun":
                defined_funcs.append(pos)
    assert synth_pos != -1
    # TODO: still need to handle defined funcs (defined_funcs is unused so far)

    synthG = cfg.CFG()
    synthfunc = parsed_file[synth_pos]
    if len(synthfunc) < 5:
        # grammar isn't specified
        return None

    synthG.root = synthfunc[4][0][0]
    synthG.nonterminals = []
    for nontDef in synthfunc[4]:
        synthG.nonterminals.append(nontDef[0])

    # NOTE: both synthfunc[4] and nontDef[2] may grow while they are being
    # iterated; the loops rely on list iteration picking up appended items.
    for nontDef in synthfunc[4]:
        for prod in nontDef[2]:
            # production is just defining a typed constant / variable
            if isinstance(prod, list) and len(prod) == 2 and prod[1] == nontDef[1]:
                synthG.alphabet[prod[0]] = 0
                synthG.productions.append([nontDef[0], prod[0], []])

            # production is a const or a single nonterminal
            elif isinstance(prod, str):
                if prod in synthG.nonterminals:
                    # production of form A -> B:
                    # take RHS of B and append it to the end of A
                    B_rhs = -1
                    for other in synthfunc[4]:
                        if other[0] == prod:
                            B_rhs = other[2]
                    assert B_rhs != -1
                    # Copy so that A += B never aliases B's own list.
                    B_rhs = copy.copy(B_rhs)
                    nontDef[2] += B_rhs
                else:
                    synthG.alphabet[prod] = 0
                    synthG.productions.append([nontDef[0], prod, []])

            # production includes function calls
            elif isinstance(prod, list):
                assert len(prod) > 1 and isinstance(prod[0], str)
                if prod[0] not in synthG.alphabet:
                    synthG.alphabet[prod[0]] = len(prod) - 1
                else:
                    assert synthG.alphabet[prod[0]] == len(prod) - 1, "Multiple arities found in '" + str(prod[0]) + "'"

                # Children of the production actually emitted.  Collected
                # explicitly because rebinding the loop variable does not
                # modify prod, so nested lists would otherwise leak into
                # the production instead of the new nonterminal.
                children = []
                for p in prod[1:]:
                    # rhs is a nested function, e.g. S -> g(g(A)):
                    # add new nonterminal B and yield S -> g(B), B -> g(A)
                    if isinstance(p, list):
                        newNT = "[X[X" + nontDef[0] + str(p) + "X]X]"
                        synthG.nonterminals.append(newNT)
                        # Appending to synthfunc[4] lets the outer loop
                        # process the new nonterminal, which allows for
                        # multiple levels of nesting.
                        # NEED TO FIND WAY TO FIND TYPE
                        synthfunc[4].append([newNT, "Unknown", [p]])
                        children.append(newNT)
                    # rhs is a nonterminal or terminal
                    elif isinstance(p, str):
                        if p not in synthG.nonterminals:
                            if p in synthG.alphabet:
                                assert synthG.alphabet[p] == 0
                            synthG.alphabet[p] = 0
                            if p[0].isupper():
                                # Single parenthesised argument: prints the
                                # same text under Python 2 and Python 3.
                                print("uppercase terminal '" + p + "' in " + str(prod))
                        children.append(p)
                    else:
                        # Any other argument type is passed through as is.
                        children.append(p)
                synthG.productions.append([nontDef[0], prod[0], children])

            else:
                assert False, "no production type matches " + str(prod)

    return synthG
else: print_help() else: print_help() return path, main if __name__ == "__main__": path, main_func = get_op() sym_tab = build_symtab(path) with open(path, 'r') as f: lines = f.readlines() _cfg_ = {} for key in sym_tab.keys(): _cfg_[key] = cfg.CFG( lines[sym_tab[key]["lines"][1]:sym_tab[key]["lines"][2]], key) assert main_func in _cfg_.keys(), ("Function %s not found!" % main_func) argument = [] for i in sym_tab['foo']['decl'].get_args(): key = i.get_name() tmp_l = float(input("Lower bound of %s << " % key)) tmp_r = float(input("Upper bound of %s << " % key)) argument.append([tmp_l, tmp_r, key]) cg = FCG(_cfg_, main_func, sym_tab) cg.set_entry_range(argument) flags = [False for i in cg.get_constraint_nodes()] updated = widen(cg, cg.get_entry_nodes(), flags, False) while updated:
def _load_json(path):
    """Return the decoded JSON content of the file at *path*."""
    # The with-block closes the file even if read() or json.loads() raises.
    with open(path, "r") as handle:
        return json.loads(handle.read())


def main():
    """Run the whole analysis on the binary currently loaded in IDA.

    Steps, in order:
      1. create an angr project for the IDA input file;
      2. run ``InforExtraction.main(filetype)`` and load the JSON files
         "vftable", "vbtable" (PE) or "VTT" (ELF), "ctor" and "symbol" from
         the current directory (presumably written by InforExtraction —
         confirm);
      3. build a CFG starting from every constructor address and every
         non-zero destructor address found in the vftable data;
      4. run the overwrite (static taint) analysis;
      5. build, report and draw the inheritance tree.
    """
    # Command-line mode: uncomment the block below and comment out the IDA
    # python calls, i.e. run InforExtraction on its own as an IDA script and
    # run the rest standalone outside IDA.  Useful when debugging.
    #
    # try:
    #     options,args = getopt.getopt(sys.argv[1:],"hf:", ["help","file="])
    # except getopt.GetoptError:
    #     sys.exit()
    # binary = None
    # for name,value in options:
    #     if name in ("-h","--help"):
    #         usage()
    #         sys.exit()
    #     if name in ("-f","--file"):
    #         binary = value
    # if binary == None:
    #     usage()
    #     sys.exit()

    # print(...) with one parenthesised argument behaves the same under
    # Python 2 and Python 3.
    print("[+]log: Start analysis")
    binary = idc.GetInputFilePath()

    # Probe address 0 in IDA to decide how angr must load the binary.
    # Original notes: "base address starts at 0" / "base address starts at a
    # non-zero value; for some ELF files the base has to be set to 0 by
    # hand, otherwise the addresses seen by IDA and by angr disagree".
    isPIE = idc.GetDisasm(0)
    if not isPIE:
        proj = angr.Project(binary, load_options={'auto_load_libs': False,'extern_size': 0x800000})
    else:
        # In the latest angr versions: custom_base_addr -> base_addr
        proj = angr.Project(binary, load_options={'main_opts':{'custom_base_addr':0},'auto_load_libs': False,'extern_size': 0x800000})

    isPE = proj.loader.all_pe_objects
    filetype = "PE" if isPE else "ELF"

    InforExtraction.main(filetype)

    vftable_list = _load_json("vftable")
    # filetype is always either "PE" or "ELF" (see above); each format has
    # its own table and the other one is passed on as None.
    if filetype == "PE":
        vbtable_list = _load_json("vbtable")
        VTT_list = None
    else:
        VTT_list = _load_json("VTT")
        vbtable_list = None
    ctor_list = _load_json("ctor")
    symbol_list = _load_json("symbol")

    # Build the ctor CFG
    start = time.time()
    start_points = [int(ctor_addr, 16) for ctor_addr in ctor_list]
    for vftable in vftable_list:
        dtor = vftable_list[vftable]["dtor"]
        # A dtor of 0 means no destructor was recorded for this vftable.
        if dtor != 0:
            start_points.append(int(dtor, 16))
    mycfg = cfg.CFG(proj=proj,start_points=start_points,symbol_list=symbol_list,thread_num=1)
    end = time.time()
    print("[+]log: Build ctor cfg completion. Time:%fs" % (end-start))
    print("[*]log: The number of analysis functions:%d" % len(mycfg.functions))

    # Overwrite analysis
    start = time.time()
    myoverwrite = StaticTaintAnalysis.StaticTaintAnalysis(proj,mycfg,vftable_list,vbtable_list,VTT_list,ctor_list,symbol_list,filetype)
    end = time.time()
    print("[+]log: Overwrite analysis completion. Time:%fs" % (end-start))

    # Inheritance tree generation
    start = time.time()
    inheritance_tree = HeuristicReasoning.HeuristicReasoning(proj,mycfg,myoverwrite.ctor_list,vftable_list,symbol_list)
    end = time.time()
    print("[+]log: Build inherTree completion. Time:%fs" % (end-start))

    inheritance_tree.statistics()
    print_CHT(inheritance_tree)
    inheritance_tree.draw()