# Helper functions (tree_to_token_index, index_to_code_token, remove_comments_and_docstrings)
# are assumed to come from the local GraphCodeBERT-style `parser` package; adjust the import
# to match your project layout.
from parser import tree_to_token_index, index_to_code_token, remove_comments_and_docstrings


def get_data_flow(code, parser):
    try:
        # Parse the source and map every leaf-node position to (token index, token string).
        tree = parser[0].parse(bytes(code, 'utf8'))
        root_node = tree.root_node
        tokens_index = tree_to_token_index(root_node)
        code_lines = code.split('\n')
        code_tokens = [index_to_code_token(x, code_lines) for x in tokens_index]
        index_to_code = {}
        for idx, (index, token) in enumerate(zip(tokens_index, code_tokens)):
            index_to_code[index] = (idx, token)
        # Run the language-specific DFG extractor; fall back to an empty graph on failure.
        try:
            DFG, _ = parser[1](root_node, index_to_code, {})
        except Exception:
            DFG = []
        DFG = sorted(DFG, key=lambda x: x[1])
        # Keep only nodes that participate in at least one data-flow edge.
        indexs = set()
        for d in DFG:
            if len(d[-1]) != 0:
                indexs.add(d[1])
            for x in d[-1]:
                indexs.add(x)
        new_DFG = []
        for d in DFG:
            if d[1] in indexs:
                new_DFG.append(d)
        dfg = new_DFG
    except Exception:
        dfg = []
    # Merge duplicate nodes that share a token index, unioning their source tokens and indexes.
    dic = {}
    for d in dfg:
        if d[1] not in dic:
            dic[d[1]] = d
        else:
            dic[d[1]] = (d[0], d[1], d[2],
                         list(set(dic[d[1]][3] + d[3])),
                         list(set(dic[d[1]][4] + d[4])))
    dfg = [dic[idx] for idx in dic]
    return dfg
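# Hedged sketch (not from the original source): how the `parser` argument expected by
# get_data_flow / extract_dataflow is typically assembled in a GraphCodeBERT-style setup.
# The 'parser/my-languages.so' path, the DFG_python import, and the pre-0.22 tree-sitter
# API (Language(path, name) / Parser.set_language) are assumptions; adjust to your build.
def build_parser_example(lang='python'):
    from tree_sitter import Language, Parser
    from parser import DFG_python
    dfg_function = {'python': DFG_python}                   # extend with DFG_java, DFG_php, ...
    LANGUAGE = Language('parser/my-languages.so', lang)     # prebuilt grammar bundle (assumed path)
    ts_parser = Parser()
    ts_parser.set_language(LANGUAGE)
    return [ts_parser, dfg_function[lang]]                  # [tree-sitter parser, DFG extractor]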
def extract_dataflow(code, parser, lang):
    # Remove comments and docstrings; fall back to the raw source if stripping fails.
    try:
        code = remove_comments_and_docstrings(code, lang)
    except Exception:
        pass
    # The PHP grammar only parses code inside <?php ... ?> tags.
    if lang == "php":
        code = "<?php" + code + "?>"
    # Obtain the data flow graph; code_tokens defaults to [] so the function still
    # returns a valid pair if parsing fails.
    code_tokens = []
    try:
        tree = parser[0].parse(bytes(code, 'utf8'))
        root_node = tree.root_node
        tokens_index = tree_to_token_index(root_node)
        code_lines = code.split('\n')
        code_tokens = [index_to_code_token(x, code_lines) for x in tokens_index]
        index_to_code = {}
        for idx, (index, token) in enumerate(zip(tokens_index, code_tokens)):
            index_to_code[index] = (idx, token)
        try:
            DFG, _ = parser[1](root_node, index_to_code, {})
        except Exception:
            DFG = []
        DFG = sorted(DFG, key=lambda x: x[1])
        # Keep only nodes that participate in at least one data-flow edge.
        indexs = set()
        for d in DFG:
            if len(d[-1]) != 0:
                indexs.add(d[1])
            for x in d[-1]:
                indexs.add(x)
        new_DFG = []
        for d in DFG:
            if d[1] in indexs:
                new_DFG.append(d)
        dfg = new_DFG
    except Exception:
        dfg = []
    return code_tokens, dfg
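# Hedged usage example (not from the original source), assuming the build_parser_example
# sketch above: prints the token stream and data-flow edges for a small Python snippet.
# Each dfg entry is expected to be a tuple of the form
# (token, token_index, relation, [source tokens], [source indexes]).
if __name__ == '__main__':
    parser = build_parser_example('python')
    sample = "def add(a, b):\n    c = a + b\n    return c\n"
    code_tokens, dfg = extract_dataflow(sample, parser, 'python')
    print(code_tokens)
    print(dfg)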