def piping_errorHttpCode(executeid):
    '''Flag crawled pages whose HTTP status code is in the configured error-code list.'''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'error_http_code'})
    if not piping:
        return True
    # Prefer the task-specific code list; fall back to the global default (task_id=0).
    if piping['extend_id']:
        extend = db.fetchone('select data from piping_extend where id=:id and status=1',
                             {'id': piping['extend_id']})
    else:
        extend = db.fetchone('select data from piping_extend where task_id=0 and piping_type=:type and status=1',
                             {'type': 'error_http_code'})
    if not extend:
        return True
    httpCodes = extend['data'].split("\n")
    results = []
    # Scan both spider status buckets (2 and 3) with the same filters.
    for status in (2, 3):
        mgRows = mgdb.spiderurl_getall_executeid_status(executeid, status, ['id', 'url', 'http_code', 'url_type'])
        for row in mgRows:
            if row['url_type'] != 'self':
                continue
            if not row['http_code']:
                continue
            # Only report URLs that belong to the monitored domain.
            if execute['domain'] != getDomainNoPort(row['url']):
                continue
            if str(row['http_code']) in httpCodes:
                results.append(row)
    return result_save(execute, piping, results) if results else True
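
# ---------------------------------------------------------------------------
# Illustration (not called anywhere): piping_errorHttpCode() compares domains
# via getDomainNoPort(), which is defined elsewhere in this codebase. The
# sketch below shows one plausible, standard-library implementation of that
# contract; the name is suffixed "_sketch" so it cannot shadow the real helper.
# ---------------------------------------------------------------------------
from urllib.parse import urlparse

def getDomainNoPort_sketch(url):
    """Return the URL's hostname with any ':port' suffix stripped."""
    netloc = urlparse(url).netloc
    return netloc.split(':', 1)[0].lower()

# e.g. getDomainNoPort_sketch('http://example.com:8080/a') == 'example.com'
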
def piping_keyword(executeid):
    '''Keyword check: verify that each configured URL still contains its expected words.'''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'keyword'})
    if not piping or not piping['extend_id']:
        return True
    # Task-owned word list.
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1',
                               {'id': piping['extend_id']})
    if not pipingExtend or not pipingExtend['data']:
        return True
    rows = json.loads(pipingExtend['data'])
    if not rows:
        return True
    kws = {it['url']: it['words'] for it in rows}
    results = []
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'file_path', 'file_extension', 'url_type'])
    for row in mgRows:
        # Skip links that do not belong to this site.
        if row['url_type'] != 'self':
            continue
        if row['url'] not in kws:
            continue
        if row['file_extension'] not in ('html', ''):
            continue
        url = row['url']
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        result = Acism(kws[url]).scan(body)
        # Words expected on this URL that did not appear in the page body.
        wordsNoExists = list(set(kws[url]).difference(set(result.keys())))
        if wordsNoExists:
            filename = "snap_code_keyword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=wordsNoExists)
            pipingResult = {"id": row['id'], "url": row['url'], "noWords": wordsNoExists,
                            "words": kws[url], 'snapshot': "\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
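
# ---------------------------------------------------------------------------
# Illustration (not called anywhere): piping_keyword() treats Acism(words)
# .scan(body) as returning a dict keyed by the words that DID appear in the
# body, so expected-but-missing words fall out of a plain set difference.
# The stand-in scanner below mimics that contract with naive substring
# counting; the real Acism is an Aho-Corasick matcher and may return richer
# values.
# ---------------------------------------------------------------------------
def _scan_sketch(words, body):
    """Return {word: occurrence_count} for every word found in body."""
    return {w: body.count(w) for w in words if w and w in body}

def _missing_words_sketch(expected, body):
    """Words that were expected on the page but did not appear."""
    return sorted(set(expected) - set(_scan_sketch(expected, body)))

# e.g. _missing_words_sketch(['about', 'contact'], '<a>contact</a>')
#      == ['about']
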
def piping_fingerprint(executeid):
    '''Fingerprint check: detect static resources whose body hash changed since the previous execution.'''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    if not execute:
        return True
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'fingerprint'})
    if not piping:
        return True
    # The most recent earlier execution of the same task; nothing to diff against otherwise.
    executeOld = db.fetchone('select * from task_execute where task_id=:tid and id<:eid order by id desc',
                             {'tid': execute['task_id'], 'eid': executeid})
    if not executeOld:
        return True
    exts = ['js', 'css', 'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp']
    fields = ['id', 'url', 'md5_url', 'md5_body', 'status', 'file_extension']
    # Index the static resources of both executions by md5_url.
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, fields)
    rowsDict = {row['md5_url']: row for row in mgRows if row['file_extension'] in exts}
    mgRows = mgdb.spiderurl_getall_executeid_status(executeOld['id'], 2, fields)
    rowsDictOld = {row['md5_url']: row for row in mgRows if row['file_extension'] in exts}
    results = []
    for md5Url, row in rowsDict.items():
        if md5Url not in rowsDictOld:
            continue
        if row['md5_body'] != rowsDictOld[md5Url]['md5_body']:
            filename = "snap_view_%s.png" % row['id']
            snapshot = _snapshot_save(executeid, 'view', row['url'], filename)[0]
            pipingResult = {
                'url': row['url'],
                'md5_url': md5Url,
                'md5_body_new': row['md5_body'],
                'md5_body_old': rowsDictOld[md5Url]['md5_body'],
                'snapshot': snapshot,
            }
            snapshot_insert(executeid, piping, row, pipingResult, snapshot)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
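
# ---------------------------------------------------------------------------
# Illustration (not called anywhere): piping_fingerprint() assumes the spider
# stored md5_url / md5_body hex digests for every fetched resource. A
# plausible way such fingerprints are produced (the crawler's actual hashing
# code lives elsewhere and may normalise URLs or bodies differently):
# ---------------------------------------------------------------------------
import hashlib

def _fingerprint_sketch(url, body_bytes):
    """Return (md5_url, md5_body) hex digests for one fetched resource."""
    md5_url = hashlib.md5(url.encode('utf-8')).hexdigest()
    md5_body = hashlib.md5(body_bytes).hexdigest()
    return md5_url, md5_body

# Two executions sharing an md5_url but differing in md5_body is exactly what
# the loop above reports as a changed static resource.
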
def piping_filterword(executeid):
    '''Sensitive-word scan: report pages containing words from the system and/or task word lists.'''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'filterword'})
    if not piping:
        return True
    # System word list.
    systemWords = []
    if piping['filterword_type'] in ['system', 'mixed']:
        pipingExtend = db.fetchall('select name from sys_filterword')
        systemWords = [row['name'] for row in pipingExtend] if pipingExtend else []
    # Task-owned word list.
    ownWords = ''
    if piping['filterword_type'] in ['own', 'mixed']:
        pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1',
                                   {'id': piping['extend_id']})
        ownWords = pipingExtend['data'] if pipingExtend else ''
    # Merge according to the configured mode, then deduplicate; see the
    # sketch after this function.
    words = []
    if piping['filterword_type'] == 'system':
        words = systemWords
    if piping['filterword_type'] == 'own':
        words = ownWords.split("\n")
    if piping['filterword_type'] == 'mixed':
        words = systemWords + ownWords.split('\n')
    words = list(set(words))
    if '' in words:
        words.remove('')
    if not words:
        return True
    acism = Acism(words)
    results = []
    rows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'file_path', 'file_extension', 'url_type'])
    for row in rows:
        if row['url_type'] != 'self':
            continue
        if row['file_extension'] not in ('html', ''):
            continue
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        result = acism.scan(body)
        if result:
            filename = "snap_code_filterword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=result.keys())
            pipingResult = {"id": row['id'], "url": row['url'], "matches": result, 'snapshot': "\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
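
# ---------------------------------------------------------------------------
# Illustration (not called anywhere): the word-list assembly from
# piping_filterword() above, extracted as a pure function so the three
# filterword_type modes are easy to see. Inputs mirror the two sources:
# sys_filterword rows and the newline-separated piping_extend.data blob.
# ---------------------------------------------------------------------------
def _merge_wordlists_sketch(filterword_type, system_words, own_data):
    """Return the deduplicated, non-empty word list for a given mode."""
    own_words = own_data.split('\n') if own_data else []
    if filterword_type == 'system':
        merged = system_words
    elif filterword_type == 'own':
        merged = own_words
    else:  # 'mixed'
        merged = system_words + own_words
    return [w for w in set(merged) if w]

# e.g. sorted(_merge_wordlists_sketch('mixed', ['a'], 'b\n\na')) == ['a', 'b']
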
def piping_darklink(executeid):
    '''Dark-link detection: flag suspicious outbound links via white/black lists, word scan, and visibility.'''
    execute = mgdb.execute_getbyid(executeid)
    if not execute:
        return False
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'darklink'})
    if not piping:
        return True
    pipingExtend = db.fetchall('select name from sys_filterword')
    words = [row['name'] for row in pipingExtend] if pipingExtend else []
    acism = Acism(words)
    # Global white/black lists, deduplicated.
    rows = db.fetchall('select domain from dk_white_list')
    whites_glb = list(set(row['domain'] for row in rows)) if rows else []
    rows = db.fetchall('select domain from dk_black_list')
    blacks_glb = list(set(row['domain'] for row in rows)) if rows else []
    # Task-level white/black lists. The data column is parsed with json.loads
    # (instead of the original eval) for safety and for consistency with the
    # other piping_* functions.
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1',
                               {'id': piping['extend_id']})
    extendData = json.loads(pipingExtend['data']) if pipingExtend and pipingExtend['data'] else {}
    whites_psl = list(set(extendData.get('white_list', [])))
    blacks_psl = list(set(extendData.get('black_list', [])))
    mgRows = mgdb.spiderurl_getall_executeid_status(
        executeid, 2,
        ['id', 'url', 'md5_url', 'md5_body', 'url_type', 'file_extension', 'file_path', 'invisible', 'referer'])
    results = []

    def _record(row, level):
        # Record each suspicious URL once, with a code snapshot highlighting it.
        if row['url'] in [r['darklink'] for r in results]:
            return
        filename = 'snap_code_darklink_%s.png' % row['id']
        snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=[row['url']])
        result = {'id': row['id'], 'referer': row['referer'], 'darklink': row['url'],
                  'level': level, 'snapshot': "\n".join(snapshots)}
        snapshot_insert(executeid, piping, row, result, snapshots)
        results.append(result)

    for row in mgRows:
        # Task-level white/black lists (confidence: absolute).
        if row['url'] in whites_psl:
            continue
        if row['url'] in blacks_psl:
            _record(row, 'absolute')
            continue
        # Static files are not dark links.
        if row['url_type'] != 'other':
            continue
        if row['file_extension'] not in ['', 'html']:
            continue
        # Global white/black lists (confidence: absolute).
        if row['url'] in whites_glb:
            continue
        if row['url'] in blacks_glb:
            _record(row, 'absolute')
            continue
        # Sensitive-word scan of the page body (confidence: high).
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        resultWord = acism.scan(body)
        if resultWord:
            _record(row, 'high')
            finddata = {'domain': execute['domain'], 'md5_body': row['md5_body']}
            setdata = {'$set': {'filterwords': json.dumps(resultWord, ensure_ascii=False)}}
            mongoSpider['outlink'].find_and_modify(finddata, setdata)
            continue
        # Visibility check: hidden links are suspicious (confidence: low).
        if row['invisible']:
            _record(row, 'low')
            continue
        # TODO: duplicate/reference-count detection (above the reference
        # threshold: high, below it: medium). The threshold is not strictly
        # defined yet; see the sketch after this function.
    return result_save(execute, piping, results) if results else True
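
# ---------------------------------------------------------------------------
# Illustration (not called anywhere): the TODO at the end of
# piping_darklink() describes a reference-count check for suspected dark
# links. One way to phrase it with PyMongo's aggregate(); the 500 threshold
# comes from the original draft and is not a settled value.
# ---------------------------------------------------------------------------
def _reference_count_sketch(outlink_collection, md5_url):
    """Count, per domain, how many outlink records share one md5_url."""
    pipeline = [
        {'$match': {'md5_url': md5_url}},
        {'$group': {'_id': '$domain', 'count': {'$sum': 1}}},
    ]
    return list(outlink_collection.aggregate(pipeline))

# e.g. rows = _reference_count_sketch(mongoSpider['outlink'], row['md5_url'])
#      if len(rows) > 500:  # referenced from many domains -> raise to 'high'
#          ...
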