def extractScriptsAndGenerateASTNodesFromURLListFinerBlock(path):
    """Group the inline scripts of every URL listed in *path* by template key.

    Each line of *path* is stripped and passed to ``fetchScripts``.  Every
    inline returned is analysed with ``analyzeJSCodesFinerBlock``; when that
    yields ``None`` the inline is retried with ``analyzeJSON``.  Inlines that
    neither analyser accepts are skipped.

    Returns a 3-tuple ``(scriptdict, total_script_count, total_json_count)``:

    * ``scriptdict`` maps ``TemplateTree.key`` to a list of
      ``(content, url, tree, index)`` tuples; ``index`` is ``-1`` for JSON
      inlines and the block position for JavaScript blocks.
    * ``total_script_count`` maps a key to the number of times it was seen.
    * ``total_json_count`` does the same for JSON inlines only.
    """
    scriptdict = {}
    total_script_count = {}
    total_json_count = {}

    # The context manager releases the URL list even when fetching or
    # analysing raises; the handle was previously never closed.
    with open(path) as url_file:
        for line in url_file:
            url = line.strip()
            print("process url " + url)
            _hosts, inlines = fetchScripts(url)
            if not inlines:
                print("no inlines for " + url)
                continue

            for inline in inlines:
                is_json = False
                rs, sc = analyzeJSCodesFinerBlock(inline)
                if rs is None:
                    # Not parseable as JavaScript: fall back to JSON.
                    rs = analyzeJSON(inline)
                    is_json = True
                if rs is None:
                    continue

                if is_json:
                    tree = TemplateTree(rs, None)
                    key = tree.key
                    if key not in scriptdict:
                        # NOTE(review): the first entry stores the
                        # re-serialised JSON while later entries store the
                        # raw inline text -- confirm this is intended.
                        scriptdict[key] = [(json.dumps(rs), url, tree, -1)]
                        total_json_count[key] = 1
                        total_script_count[key] = 1
                    else:
                        scriptdict[key].append((inline, url, tree, -1))
                        total_json_count[key] += 1
                        total_script_count[key] += 1
                    continue

                for index, seq in enumerate(rs):
                    tree = TemplateTree(seq, None)
                    key = tree.key
                    block = sc[index]
                    if key not in scriptdict:
                        scriptdict[key] = [(block, url, tree, index)]
                        total_script_count[key] = 1
                        print(" add key %s" % key)
                    else:
                        # Only keep one copy of each distinct block text.
                        contents = [entry[0] for entry in scriptdict[key]]
                        if block not in contents:
                            scriptdict[key].append((block, url, tree, index))
                            print(" item %s has %d unique scripts"
                                  % (key, len(scriptdict[key])))
                        # NOTE(review): counts every occurrence, including
                        # duplicates that were not appended -- confirm.
                        total_script_count[key] += 1

    return scriptdict, total_script_count, total_json_count
# NOTE(review): this file contains a second definition of
# matchScriptWithDomainTemplate further down; at import time the later
# definition replaces this one.
def matchScriptWithDomainTemplate(domain, script, treedict=None):
    """Split *script* into blocks accepted and rejected by the domain's trees.

    domain   -- used to load trees with ``getTreesForDomainFromDB`` when
                *treedict* is not supplied, and in the failure message.
    script   -- inline text; analysed as JavaScript blocks first and, when
                that yields ``None``, as JSON.
    treedict -- optional pre-loaded trees for the domain.

    Returns ``(allowed, failed)``:

    * ``(None, None)`` when no trees are available for the domain;
    * ``([], [])`` when the text is neither JavaScript nor JSON;
    * otherwise two lists holding the accepted / rejected JavaScript block
      texts, or the parsed JSON value for a JSON inline.

    Also prints the average matching time per block as ``MATCH_TIME``.
    """
    if treedict is None:
        treedict = getTreesForDomainFromDB(domain)
    if not treedict:
        # Two spaces: same output as the former `print "... ", domain`.
        print("failed to fetch trees for domain  %s" % (domain,))
        return None, None

    is_json = False
    rs, sc = analyzeJSCodesFinerBlock(script)
    if rs is None:
        rs = analyzeJSON(script)
        is_json = True
    if rs is None:
        print("no script nor json")
        return [], []

    allowed_sc = []
    failed_sc = []
    t1 = time()
    if is_json:
        # JSON inlines are checked with simpleCompare; compare() is the
        # alternative used for JavaScript blocks below.
        tree = TemplateTree(rs, None)
        if simpleCompare(treedict, tree):
            allowed_sc.append(rs)
            print("JSON allowed ")
        else:
            failed_sc.append(rs)
            print("JSON failed ")
    else:
        print("generate %d subtrees for target script" % len(rs))
        for index, seq in enumerate(rs):
            tree = TemplateTree(seq, None)
            if compare(treedict, tree):
                allowed_sc.append(sc[index])
            else:
                failed_sc.append(sc[index])
        print("allowed %d blocks, failed %d blocks"
              % (len(allowed_sc), len(failed_sc)))
    t2 = time()

    total_time = t2 - t1
    total_size = len(allowed_sc) + len(failed_sc)
    if total_size != 0:
        avg_time = total_time / total_size
        print("MATCH_TIME: %f " % avg_time)
    return allowed_sc, failed_sc
# NOTE(review): an earlier definition of matchScriptWithDomainTemplate exists
# in this file; being the later one, this definition is the one in effect.
def matchScriptWithDomainTemplate(domain, script, treedict=None):
    """Classify the blocks of *script* against the template trees of *domain*.

    When *treedict* is ``None`` the trees are loaded with
    ``getTreesForDomainFromDB(domain)``.  The text is analysed with
    ``analyzeJSCodesFinerBlock``; when that yields ``None`` it is retried
    with ``analyzeJSON``.

    Returns a pair ``(allowed_sc, failed_sc)``:

    * ``(None, None)`` if there are no trees for the domain;
    * ``([], [])`` if the text is neither JavaScript nor JSON;
    * for JavaScript, lists of the block texts that did / did not match
      (``compare``);
    * for JSON, the parsed value placed in one of the two lists
      (``simpleCompare``).

    The average time spent per block is printed as ``MATCH_TIME``.
    """
    if treedict is None:
        treedict = getTreesForDomainFromDB(domain)
    if not treedict:
        # Two spaces: same output as the former `print "... ", domain`.
        print("failed to fetch trees for domain  %s" % (domain,))
        return None, None

    is_json = False
    rs, sc = analyzeJSCodesFinerBlock(script)
    if rs is None:
        rs = analyzeJSON(script)
        is_json = True
    if rs is None:
        print("no script nor json")
        return [], []

    allowed_sc = []
    failed_sc = []
    started = time()
    if is_json:
        tree = TemplateTree(rs, None)
        # JSON goes through simpleCompare rather than compare().
        if simpleCompare(treedict, tree):
            allowed_sc.append(rs)
            print("JSON allowed ")
        else:
            failed_sc.append(rs)
            print("JSON failed ")
    else:
        print("generate %d subtrees for target script" % len(rs))
        for index, seq in enumerate(rs):
            # JavaScript blocks go through compare() rather than
            # simpleCompare.
            matched = compare(treedict, TemplateTree(seq, None))
            (allowed_sc if matched else failed_sc).append(sc[index])
        print("allowed %d blocks, failed %d blocks"
              % (len(allowed_sc), len(failed_sc)))
    finished = time()

    total_size = len(allowed_sc) + len(failed_sc)
    if total_size != 0:
        avg_time = (finished - started) / total_size
        print("MATCH_TIME: %f " % avg_time)
    return allowed_sc, failed_sc
def matchTreesFromDomainWithScript(domain, script, treedict=None):
    """Split *script* into blocks that match / do not match the domain trees.

    When *treedict* is ``None`` the trees are loaded with
    ``getTreesForDomainFromDB(domain)``.  The text is analysed with
    ``analyzeJSCodesFinerBlock`` and, if that yields ``None``, with
    ``analyzeJSON``.

    Unlike ``matchScriptWithDomainTemplate`` in this file, JSON inlines are
    checked with ``compare`` and JavaScript blocks with ``simpleCompare``,
    and no timing is reported.

    Returns ``(allowed_sc, failed_sc)``:

    * ``(None, None)`` if there are no trees for the domain;
    * ``([], [])`` if the text is neither JavaScript nor JSON;
    * otherwise the accepted / rejected JavaScript block texts, or the parsed
      JSON value for a JSON inline.
    """
    if treedict is None:
        treedict = getTreesForDomainFromDB(domain)
    if not treedict:
        # Two spaces: same output as the former `print "... ", domain`.
        print("failed to fetch trees for domain  %s" % (domain,))
        return None, None

    is_json = False
    rs, sc = analyzeJSCodesFinerBlock(script)
    if rs is None:
        rs = analyzeJSON(script)
        is_json = True
    if rs is None:
        print("no script nor json")
        return [], []

    allowed_sc = []
    failed_sc = []
    if is_json:
        tree = TemplateTree(rs, None)
        if compare(treedict, tree):
            allowed_sc.append(rs)
            print("JSON allowed ")
        else:
            failed_sc.append(rs)
            print("JSON failed ")
    else:
        print("generate %d subtrees for target script" % len(rs))
        for index, seq in enumerate(rs):
            tree = TemplateTree(seq, None)
            if simpleCompare(treedict, tree):
                allowed_sc.append(sc[index])
            else:
                failed_sc.append(sc[index])
        print("allowed %d blocks, failed %d blocks"
              % (len(allowed_sc), len(failed_sc)))
    return allowed_sc, failed_sc
def extractScriptsAndGenerateASTNodesFromURLList(url_path):
    """Group the inline scripts of every URL in *url_path* by an MD5 key.

    Each line of *url_path* is stripped and passed to ``fetchScripts``.
    Every inline is analysed with ``analyzeJSCodes`` and, if that yields
    ``None``, with ``analyzeJSON``; inlines neither accepts are skipped.

    The key is the MD5 hex digest of the ``tag`` of each node (JavaScript)
    or of each item yielded by iterating the JSON result.

    Returns a dict mapping that key to a list of ``(inline, url, rs)``
    tuples, holding each distinct inline text once per key.
    """
    scriptdict = {}
    # `with` guarantees the URL list is closed even if fetching or analysis
    # raises; the handle was previously closed only on the success path.
    with open(url_path) as url_file:
        for line in url_file:
            url = line.strip()
            print("process url " + url)
            _hosts, inlines = fetchScripts(url)
            if not inlines:
                print("no inlines for " + url)
                continue

            for inline in inlines:
                is_json = False
                rs = analyzeJSCodes(inline)
                if rs is None:
                    rs = analyzeJSON(inline)
                    is_json = True
                if rs is None:
                    continue

                digest = hashlib.md5()
                if is_json:
                    # NOTE(review): hashes items in iteration order; if `rs`
                    # is a dict the key depends on its ordering -- confirm.
                    for k in rs:
                        digest.update(k)
                else:
                    for node in rs:
                        digest.update(node.tag)
                key = digest.hexdigest()

                if key not in scriptdict:
                    scriptdict[key] = [(inline, url, rs)]
                    print(" add key %s" % key)
                else:
                    contents = [entry[0] for entry in scriptdict[key]]
                    if inline not in contents:
                        scriptdict[key].append((inline, url, rs))
                        print(" item %s has %d distinct scripts"
                              % (key, len(scriptdict[key])))
    return scriptdict
# NOTE(review): this file contains a second definition of
# generateTemplateBasedOnURLsFromFile further down; at import time the later
# definition replaces this one.
def generateTemplateBasedOnURLsFromFile(path, dst_path):
    """Build and store template trees for the inline scripts of listed URLs.

    path     -- text file with one URL per line.
    dst_path -- existing directory that receives the report files:
                ``debug`` (occurrence count and one example per key), one
                ``<count>_<key>`` file per template key, ``trees`` (the
                serialised trees) and ``jsons`` (created but left empty).

    Steps:
      1. Fetch every URL, split its inlines into JavaScript blocks (or a
         JSON value) and group the distinct block texts by
         ``TemplateTree.key``.
      2. For every key, generate a node pattern for each String / Object /
         Array node from the values seen across the grouped scripts, and
         record it on the first tree of the group.
      3. Persist every tree with ``storeTree`` and write it to ``trees``.

    Returns a dict mapping a template key to the URLs of its scripts, with
    an entry added each time a node pattern reports ``is_insufficient()``.
    """
    scriptdict = {}
    total_script_blocks = 0
    total_uniq_script_blocks = 0
    debug_dict = {}
    static_scripts = 0
    dynamic_scripts = 0

    # ---- 1. collect script blocks grouped by template key ----------------
    # `with` closes the URL list; the handle was previously never closed.
    with open(path) as url_file:
        for line in url_file:
            url = line.strip()
            print("process url " + url)
            _hosts, inlines = fetchScripts(url)
            if not inlines:
                print("no inlines for " + url)
                continue

            for inline in inlines:
                is_json = False
                rs, sc = analyzeJSCodesFinerBlock(inline)
                if rs is None:
                    rs = analyzeJSON(inline)
                    is_json = True
                if rs is None:
                    continue

                if is_json:
                    tree = TemplateTree(rs, None)
                    key = tree.key
                    if key not in scriptdict:
                        scriptdict[key] = [(inline, url, tree, -1)]
                        debug_dict[key] = [inline]
                    else:
                        debug_dict[key].append(inline)
                        # Bug fix: this lookup used a bare `key` that was
                        # either unbound or left over from an earlier
                        # JavaScript block, instead of this tree's key.
                        contents = [entry[0] for entry in scriptdict[key]]
                        if inline not in contents:
                            scriptdict[key].append((inline, url, tree, -1))
                            total_uniq_script_blocks += 1
                    total_script_blocks += 1
                    continue

                for index, seq in enumerate(rs):
                    total_script_blocks += 1
                    tree = TemplateTree(seq, None)
                    key = tree.key
                    block = sc[index]
                    if key not in scriptdict:
                        debug_dict[key] = [block]
                        scriptdict[key] = [(block, url, tree, index)]
                        print(" add key %s" % key)
                    else:
                        contents = [entry[0] for entry in scriptdict[key]]
                        debug_dict[key].append(block)
                        if block not in contents:
                            scriptdict[key].append((block, url, tree, index))
                            print(" item %s has %d unique scripts"
                                  % (key, len(scriptdict[key])))
                            # NOTE(review): the first block of a key is not
                            # counted as unique -- confirm intended.
                            total_uniq_script_blocks += 1

    with open(os.path.join(dst_path, 'debug'), 'w') as fw:
        for k in debug_dict:
            vals = debug_dict[k]
            fw.write("%d %s \n" % (len(vals), k))
            fw.write(" --EXAMPLE-- %s\n" % vals[0])

    # ---- 2. analyse the trees of every key -------------------------------
    # scriptdict[tree_key] = [(script, url, tree, index)]
    trees = []
    insufficient_urls = {}
    keys = sorted(scriptdict.keys(), key=lambda k: len(scriptdict[k]))
    for key in keys:
        is_static = True
        script_list = scriptdict[key]
        name = "%d_%s" % (len(script_list), key)
        with open(os.path.join(dst_path, name), 'w') as fw:
            for item in script_list:
                fw.write(item[1] + "||" + str(item[3]) + " " + str(item[0]) +
                         "\n")

            # All trees sharing a key must have the same number of nodes.
            lengths = [len(item[2].nodes) for item in script_list]
            if min(lengths) != max(lengths):
                fw.write("[ALERT] seq length is not consistent")
                continue
            seq_length = lengths[0]

            # Only JavaScript is analysed for now; JSON trees are stored
            # as they are.
            tree = script_list[0][2]
            if tree.type == "json":
                print("the inline is json!")
                fw.write("[TODO]: the inline is json. This is next step\n")
                trees.append(tree)
                continue

            # Process String/Object/Array nodes.
            fw.write("start analyzeing values\n")
            script_length = len(script_list)
            urls = [item[1] for item in script_list]
            for i in range(seq_length):
                node = tree.nodes[i]
                try:
                    if node.tag == "String":
                        vals = [item[2].nodes[i].value for item in script_list]
                        encoded_val = [b64encode(x) for x in vals]
                        tree.strings[i] = vals
                        node_pattern = generateNodePattern(vals)
                        # The first pattern that is neither constant nor
                        # insufficient marks the whole group as dynamic.
                        if is_static and \
                                node_pattern.tp != StringType.CONST and \
                                node_pattern.tp != StringType.INSUFFICIENT:
                            is_static = False
                            dynamic_scripts += script_length
                        tree.string_types_str[str(i)] = node_pattern.dumps()
                        if node_pattern.is_insufficient():
                            insufficient_urls.setdefault(key, []).extend(urls)
                        print("STRING%d: [TYPE:%s] [VALUE:%s]"
                              % (i, tree.string_types_str[str(i)],
                                 ','.join(encoded_val)))
                    elif node.tag == "Object":
                        values_by_key = extractObjectValues(
                            analyzeObjectResultHelper(script_list, i))
                        type_dict = {}
                        for k in values_by_key:
                            encoded_val = [b64encode(x)
                                           for x in values_by_key[k]]
                            node_pattern = generateNodePattern(
                                values_by_key[k])
                            if is_static and \
                                    node_pattern.tp != StringType.CONST and \
                                    node_pattern.tp != StringType.INSUFFICIENT:
                                is_static = False
                                dynamic_scripts += script_length
                            type_dict[k] = node_pattern.dumps()
                            if node_pattern.is_insufficient():
                                insufficient_urls.setdefault(
                                    key, []).extend(urls)
                            print("OBJECT%d: [TYPE:%s] [KEY:%s][VALUE:%s]"
                                  % (i, type_dict[k], k,
                                     ','.join(encoded_val)))
                        tree.objects[i] = values_by_key
                        tree.object_types_str[str(i)] = type_dict
                    elif node.tag == "Array":
                        values_by_key = extractObjectValues(
                            analyzeArrayResultHelper(script_list, i))
                        type_dict = {}
                        for k in values_by_key:
                            encoded_val = [b64encode(x)
                                           for x in values_by_key[k]]
                            node_pattern = generateNodePattern(
                                values_by_key[k])
                            if is_static and \
                                    node_pattern.tp != StringType.CONST and \
                                    node_pattern.tp != StringType.INSUFFICIENT:
                                is_static = False
                                dynamic_scripts += script_length
                            type_dict[k] = node_pattern.dumps()
                            if node_pattern.is_insufficient():
                                insufficient_urls.setdefault(
                                    key, []).extend(urls)
                            print("ARRAY%d: [TYPE:%s] [KEY:%s][VALUE:%s]"
                                  % (i, type_dict[k], k,
                                     ','.join(encoded_val)))
                        tree.arrays[i] = values_by_key
                        tree.array_types_str[str(i)] = type_dict
                except Exception as e:
                    # Best effort: a failing node is reported and skipped.
                    # NOTE(review): the tag names a different function than
                    # this one; left unchanged in case logs are matched on it.
                    displayErrorMsg("fetchAndProcessScriptsOfURLsFromFile",
                                    "excpetion in analyzing node %d %s "
                                    % (i, str(e)))

            if is_static:
                static_scripts += script_length
            print("Done writing %d items for file %s "
                  % (len(script_list), name))
            trees.append(tree)

    # ---- 3. store trees ---------------------------------------------------
    trees = sorted(trees, key=lambda x: x.get_length())
    # "jsons" is opened but never written; it is still created so the set
    # of output files stays the same.
    with open(os.path.join(dst_path, "trees"), 'w') as fw, \
            open(os.path.join(dst_path, "jsons"), 'w'):
        for i, stored in enumerate(trees):
            tree_val = stored.dumps()
            url = scriptdict[stored.key][0][1]
            storeTree(url, stored.key, tree_val)
            fw.write("1 %.3d: %s\n" % (i, tree_val))
            # Reload the serialised form and write what it contains.
            new_tree = TemplateTree(None, None)
            new_tree.loads(tree_val)
            if stored.type == "js":
                fw.write("2 %.3d: %s\n" % (i, getTreeSeq(new_tree.nodes)))
            elif stored.type == 'json':
                fw.write("2 %.3d: %s\n" % (i, json.dumps(new_tree.nodes)))

    print("generate %d trees for %d scripts uniqe[%d]"
          % (len(trees), total_script_blocks, total_uniq_script_blocks))
    print("static_scripts:%d dynamic_scripts:%d" % (static_scripts,
                                                    dynamic_scripts))
    return insufficient_urls
# NOTE(review): an earlier definition of generateTemplateBasedOnURLsFromFile
# exists in this file; being the later one, this definition is the one in
# effect.
def generateTemplateBasedOnURLsFromFile(path, dst_path):
    """Generate, analyse and store template trees for the URLs in *path*.

    path     -- text file with one URL per line.
    dst_path -- existing directory that receives the report files:
                ``debug`` (occurrence count and one example per key), one
                ``<count>_<key>`` file per template key, ``trees`` (the
                serialised trees) and ``jsons`` (created but left empty).

    The inlines of every URL are split into JavaScript blocks (or parsed as
    JSON) and the distinct block texts are grouped by ``TemplateTree.key``.
    For each group a node pattern is generated for every String / Object /
    Array node from the values seen across the group and recorded on the
    group's first tree.  Finally every tree is passed to ``storeTree`` and
    written to ``trees``.

    Returns a dict mapping a template key to the URLs of its scripts, with
    an entry added each time a node pattern reports ``is_insufficient()``.
    """
    scriptdict = {}
    total_script_blocks = 0
    total_uniq_script_blocks = 0
    debug_dict = {}
    static_scripts = 0
    dynamic_scripts = 0

    # ---- collect script blocks grouped by template key -------------------
    # `with` closes the URL list; the handle was previously never closed.
    with open(path) as url_file:
        for line in url_file:
            url = line.strip()
            print("process url " + url)
            _hosts, inlines = fetchScripts(url)
            if not inlines:
                print("no inlines for " + url)
                continue

            for inline in inlines:
                is_json = False
                rs, sc = analyzeJSCodesFinerBlock(inline)
                if rs is None:
                    rs = analyzeJSON(inline)
                    is_json = True
                if rs is None:
                    continue

                if is_json:
                    tree = TemplateTree(rs, None)
                    key = tree.key
                    if key not in scriptdict:
                        scriptdict[key] = [(inline, url, tree, -1)]
                        debug_dict[key] = [inline]
                    else:
                        debug_dict[key].append(inline)
                        # Bug fix: this lookup used a bare `key` that was
                        # either unbound or left over from an earlier
                        # JavaScript block, instead of this tree's key.
                        contents = [entry[0] for entry in scriptdict[key]]
                        if inline not in contents:
                            scriptdict[key].append((inline, url, tree, -1))
                            total_uniq_script_blocks += 1
                    total_script_blocks += 1
                    continue

                for index, seq in enumerate(rs):
                    total_script_blocks += 1
                    tree = TemplateTree(seq, None)
                    key = tree.key
                    block = sc[index]
                    if key not in scriptdict:
                        debug_dict[key] = [block]
                        scriptdict[key] = [(block, url, tree, index)]
                        print(" add key %s" % key)
                    else:
                        contents = [entry[0] for entry in scriptdict[key]]
                        debug_dict[key].append(block)
                        if block not in contents:
                            scriptdict[key].append((block, url, tree, index))
                            print(" item %s has %d unique scripts"
                                  % (key, len(scriptdict[key])))
                            # NOTE(review): the first block of a key is not
                            # counted as unique -- confirm intended.
                            total_uniq_script_blocks += 1

    with open(os.path.join(dst_path, 'debug'), 'w') as fw:
        for k in debug_dict:
            vals = debug_dict[k]
            fw.write("%d %s \n" % (len(vals), k))
            fw.write(" --EXAMPLE-- %s\n" % vals[0])

    # ---- analyse the trees of every key ----------------------------------
    # scriptdict[tree_key] = [(script, url, tree, index)]
    trees = []
    insufficient_urls = {}
    keys = sorted(scriptdict.keys(), key=lambda k: len(scriptdict[k]))
    for key in keys:
        is_static = True
        script_list = scriptdict[key]
        name = "%d_%s" % (len(script_list), key)
        with open(os.path.join(dst_path, name), 'w') as fw:
            for item in script_list:
                fw.write(item[1] + "||" + str(item[3]) + " " + str(item[0]) + "\n")

            # All trees sharing a key must have the same number of nodes.
            lengths = [len(item[2].nodes) for item in script_list]
            if min(lengths) != max(lengths):
                fw.write("[ALERT] seq length is not consistent")
                continue
            seq_length = lengths[0]

            # Only JavaScript is analysed for now; JSON trees are stored
            # as they are.
            tree = script_list[0][2]
            if tree.type == "json":
                print("the inline is json!")
                fw.write("[TODO]: the inline is json. This is next step\n")
                trees.append(tree)
                continue

            # Process String/Object/Array nodes.
            fw.write("start analyzeing values\n")
            script_length = len(script_list)
            urls = [item[1] for item in script_list]
            for i in range(seq_length):
                node = tree.nodes[i]
                try:
                    if node.tag == "String":
                        vals = [item[2].nodes[i].value for item in script_list]
                        encoded_val = [b64encode(x) for x in vals]
                        tree.strings[i] = vals
                        node_pattern = generateNodePattern(vals)
                        # The first pattern that is neither constant nor
                        # insufficient marks the whole group as dynamic.
                        if is_static and \
                                node_pattern.tp != StringType.CONST and \
                                node_pattern.tp != StringType.INSUFFICIENT:
                            is_static = False
                            dynamic_scripts += script_length
                        tree.string_types_str[str(i)] = node_pattern.dumps()
                        if node_pattern.is_insufficient():
                            insufficient_urls.setdefault(key, []).extend(urls)
                        print("STRING%d: [TYPE:%s] [VALUE:%s]"
                              % (i, tree.string_types_str[str(i)],
                                 ','.join(encoded_val)))
                    elif node.tag == "Object":
                        values_by_key = extractObjectValues(
                            analyzeObjectResultHelper(script_list, i))
                        type_dict = {}
                        for k in values_by_key:
                            encoded_val = [b64encode(x)
                                           for x in values_by_key[k]]
                            node_pattern = generateNodePattern(
                                values_by_key[k])
                            if is_static and \
                                    node_pattern.tp != StringType.CONST and \
                                    node_pattern.tp != StringType.INSUFFICIENT:
                                is_static = False
                                dynamic_scripts += script_length
                            type_dict[k] = node_pattern.dumps()
                            if node_pattern.is_insufficient():
                                insufficient_urls.setdefault(
                                    key, []).extend(urls)
                            print("OBJECT%d: [TYPE:%s] [KEY:%s][VALUE:%s]"
                                  % (i, type_dict[k], k,
                                     ','.join(encoded_val)))
                        tree.objects[i] = values_by_key
                        tree.object_types_str[str(i)] = type_dict
                    elif node.tag == "Array":
                        values_by_key = extractObjectValues(
                            analyzeArrayResultHelper(script_list, i))
                        type_dict = {}
                        for k in values_by_key:
                            encoded_val = [b64encode(x)
                                           for x in values_by_key[k]]
                            node_pattern = generateNodePattern(
                                values_by_key[k])
                            if is_static and \
                                    node_pattern.tp != StringType.CONST and \
                                    node_pattern.tp != StringType.INSUFFICIENT:
                                is_static = False
                                dynamic_scripts += script_length
                            type_dict[k] = node_pattern.dumps()
                            if node_pattern.is_insufficient():
                                insufficient_urls.setdefault(
                                    key, []).extend(urls)
                            print("ARRAY%d: [TYPE:%s] [KEY:%s][VALUE:%s]"
                                  % (i, type_dict[k], k,
                                     ','.join(encoded_val)))
                        tree.arrays[i] = values_by_key
                        tree.array_types_str[str(i)] = type_dict
                except Exception as e:
                    # Best effort: a failing node is reported and skipped.
                    # NOTE(review): the tag names a different function than
                    # this one; left unchanged in case logs are matched on it.
                    displayErrorMsg("fetchAndProcessScriptsOfURLsFromFile",
                                    "excpetion in analyzing node %d %s "
                                    % (i, str(e)))

            if is_static:
                static_scripts += script_length
            print("Done writing %d items for file %s "
                  % (len(script_list), name))
            trees.append(tree)

    # ---- store trees ------------------------------------------------------
    trees = sorted(trees, key=lambda x: x.get_length())
    # "jsons" is opened but never written; it is still created so the set
    # of output files stays the same.
    with open(os.path.join(dst_path, "trees"), 'w') as fw, \
            open(os.path.join(dst_path, "jsons"), 'w'):
        for i, stored in enumerate(trees):
            tree_val = stored.dumps()
            url = scriptdict[stored.key][0][1]
            storeTree(url, stored.key, tree_val)
            fw.write("1 %.3d: %s\n" % (i, tree_val))
            # Reload the serialised form and write what it contains.
            new_tree = TemplateTree(None, None)
            new_tree.loads(tree_val)
            if stored.type == "js":
                fw.write("2 %.3d: %s\n" % (i, getTreeSeq(new_tree.nodes)))
            elif stored.type == 'json':
                fw.write("2 %.3d: %s\n" % (i, json.dumps(new_tree.nodes)))

    print("generate %d trees for %d scripts uniqe[%d]"
          % (len(trees), total_script_blocks, total_uniq_script_blocks))
    print("static_scripts:%d dynamic_scripts:%d" % (static_scripts, dynamic_scripts))
    return insufficient_urls