示例#1
0
def piping_errorHttpCode(executeid):
    '''Flag crawled URLs of the task's own domain whose HTTP status code is in the configured error-code list.

    Returns result_save(...) when matches are found, otherwise True.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type': 'error_http_code'})
    if not piping: return True
    # Task-specific code list takes precedence; fall back to the global default (task_id=0).
    if piping['extend_id']:
        extend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    else:
        extend = db.fetchone('select data from piping_extend where task_id=0 and piping_type=:type and status=1', {'type': 'error_http_code'})
    if not extend:
        return True
    # One configured code per line; a set gives O(1) membership tests.
    httpCodes = set(extend['data'].split("\n"))
    results = []
    # The original scanned status 2 and status 3 with two identical loops; iterate both statuses instead.
    for status in (2, 3):
        mgRows = mgdb.spiderurl_getall_executeid_status(executeid, status, ['id', 'url', 'http_code', 'url_type'])
        for row in mgRows:
            if row['url_type'] != 'self': continue
            if not row['http_code']: continue
            # Only URLs belonging to the executed task's domain are checked.
            if execute['domain'] != getDomainNoPort(row['url']): continue
            if str(row['http_code']) in httpCodes: results.append(row)
    return result_save(execute, piping, results) if results else True
示例#2
0
def piping_keyword(executeid):
    '''Keyword filter: verify each configured URL's archived page body still contains its required keywords.

    Returns result_save(...) when any configured keyword is missing, otherwise True.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type': 'keyword'})
    if not piping or not piping['extend_id']: return True
    # Task-owned keyword dictionary stored as JSON in piping_extend.data.
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    if not pipingExtend or not pipingExtend['data']: return True
    rows = json.loads(pipingExtend['data'])
    if not rows: return True
    # url -> list of words that must appear on that page
    kws = {it['url']: it['words'] for it in rows}
    results = []
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'file_path', 'file_extension', 'url_type'])
    for row in mgRows:
        # Links that do not belong to this site produce no result.
        if row['url_type'] != 'self': continue
        if row['url'] not in kws: continue
        # Only HTML (or extension-less) pages are scanned.
        if not (row['file_extension'] == 'html' or row['file_extension'] == ''): continue
        url = row['url']
        # Fetch the archived body from DFS; close the response to avoid leaking sockets.
        resp = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path']))
        try:
            body = resp.read().decode('utf-8', 'ignore')
        finally:
            resp.close()
        result = Acism(kws[url]).scan(body)
        # Configured words that were NOT found in the body.
        wordsNoExists = list(set(kws[url]).difference(set(result.keys())))
        if wordsNoExists:
            filename = "snap_code_keyword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=wordsNoExists)
            pipingResult = {"id":row['id'], "url":row['url'], "noWords": wordsNoExists, "words": kws[url], 'snapshot':"\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
示例#3
0
def piping_fingerprint(executeid):
    '''Fingerprint check: detect static resources whose body hash changed since the previous execute of the same task.

    Returns result_save(...) when any fingerprint differs, otherwise True.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    if not execute: return True
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid':execute['task_id'], 'type':'fingerprint'})
    if not piping: return True
    # Most recent earlier execute of the same task; nothing to compare on the first run.
    executeOld = db.fetchone('select * from task_execute where task_id=:tid and id<:eid order by id desc', {'tid': execute['task_id'], 'eid':executeid})
    if not executeOld: return True
    # Only static resources are fingerprinted.
    exts = ['js','css','jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp']
    # Index both executes by md5_url (duplicated loops folded into a helper).
    rowsDict = _fingerprint_index(executeid, exts)
    rowsDictOld = _fingerprint_index(executeOld['id'], exts)
    results = []
    for md5Url, row in rowsDict.items():
        rowOld = rowsDictOld.get(md5Url)
        if rowOld is None: continue
        if row['md5_body'] == rowOld['md5_body']: continue
        # Content changed: capture a rendered snapshot of the live URL as evidence.
        filename = "snap_view_%s.png" % row['id']
        snapshot = _snapshot_save(executeid, 'view', row['url'], filename)[0]
        pipingResult = {
            'url':row['url'],
            'md5_url':md5Url,
            'md5_body_new':row['md5_body'],
            'md5_body_old':rowOld['md5_body'],
            'snapshot':snapshot
        }
        snapshot_insert(executeid, piping, row, pipingResult, snapshot)
        results.append(pipingResult)
    return result_save(execute, piping, results) if results else True


def _fingerprint_index(executeid, exts):
    '''Index an execute's crawled rows (status 2) by md5_url, keeping only static-resource extensions.'''
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url','md5_url', 'md5_body', 'status', 'file_extension'])
    return {row['md5_url']: row for row in mgRows if row['file_extension'] in exts}
示例#4
0
def piping_filterword(executeid):
    '''Sensitive-word filter: scan archived HTML pages for words from the system and/or task-owned dictionaries.

    piping['filterword_type'] selects the dictionary: 'system', 'own', or 'mixed'.
    Returns result_save(...) when any page matches, otherwise True.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'filterword'})
    if not piping: return True

    # System dictionary (shared across tasks), as a list of words.
    systemWords = []
    if piping['filterword_type'] in ['system', 'mixed']:
        pipingExtend = db.fetchall('select name from sys_filterword')
        systemWords = [row['name'] for row in pipingExtend] if pipingExtend else []
    # Task-owned dictionary, stored as a newline-separated string.
    ownWords = ''
    if piping['filterword_type'] in ['own', 'mixed']:
        pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
        ownWords = pipingExtend['data'] if pipingExtend else ''
    # The three cases are mutually exclusive — use an elif chain.
    if piping['filterword_type'] == 'system':
        words = systemWords
    elif piping['filterword_type'] == 'own':
        words = ownWords.split("\n")
    elif piping['filterword_type'] == 'mixed':
        words = systemWords + ownWords.split('\n')
    else:
        words = []
    # Deduplicate and drop empty entries left by blank lines.
    words = [w for w in set(words) if w != '']
    if not words:
        return True

    acism = Acism(words)

    results = []
    rows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id','url', 'file_path', 'file_extension','url_type'])
    for row in rows:
        if row['url_type'] != 'self': continue
        # Only HTML (or extension-less) pages are scanned.
        if not (row['file_extension'] == 'html' or row['file_extension'] == ''): continue
        # Fetch the archived body from DFS; close the response to avoid leaking sockets.
        resp = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path']))
        try:
            body = resp.read().decode('utf-8', 'ignore')
        finally:
            resp.close()
        result = acism.scan(body)
        if result:
            filename = "snap_code_filterword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=result.keys())
            pipingResult = {"id":row['id'], "url":row['url'], "matches":result, 'snapshot':"\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
示例#5
0
def piping_darklink(executeid):
    '''Dark-link detection: flag outbound links that hit a black list (absolute), contain sensitive
    words (high), or are invisible on the page (low).

    Returns False when the execute is missing, result_save(...) when findings exist, otherwise True.
    '''
    execute = mgdb.execute_getbyid(executeid)
    if not execute: return False
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'darklink'})
    if not piping: return True
    # System-wide sensitive-word dictionary used for the "high" confidence scan.
    pipingExtend = db.fetchall('select name from sys_filterword')
    words = [row['name'] for row in pipingExtend] if pipingExtend else []
    acism = Acism(words)
    # Global (system) white/black domain lists, deduplicated.
    rows = db.fetchall('select domain from dk_white_list')
    whites_glb = list({row['domain'] for row in rows}) if rows else []
    rows = db.fetchall('select domain from dk_black_list')
    blacks_glb = list({row['domain'] for row in rows}) if rows else []
    # Per-task (personal) white/black lists stored in piping_extend.data; parse once.
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    # NOTE(review): eval() on DB-stored text is dangerous if the data is ever attacker-influenced;
    # prefer json.loads or ast.literal_eval if the stored format allows — confirm before changing.
    extendData = eval(pipingExtend['data'])
    whites_psl = list(set(extendData['white_list']))
    blacks_psl = list(set(extendData['black_list']))
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'md5_url', 'md5_body', 'url_type','file_extension','file_path', 'invisible','referer'])
    results = []
    for row in mgRows:
        # Personal white list: never a dark link.
        if row['url'] in whites_psl: continue
        # Personal black list: certain dark link ("absolute"). If the URL was already
        # recorded we deliberately fall through to the remaining checks (original behavior).
        if row['url'] in blacks_psl:
            if _darklink_record(executeid, piping, row, results, 'absolute'):
                continue
        # Static files are not dark links; only external, HTML-ish pages qualify.
        if row['url_type'] != 'other': continue
        if row['file_extension'] not in ['', 'html']: continue
        # System white/black lists ("absolute").
        if row['url'] in whites_glb: continue
        if row['url'] in blacks_glb:
            if _darklink_record(executeid, piping, row, results, 'absolute'):
                continue
        # Sensitive-word scan of the archived body ("high"). Close the DFS response.
        resp = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path']))
        try:
            body = resp.read().decode('utf-8','ignore')
        finally:
            resp.close()
        resultWord = acism.scan(body)
        if resultWord:
            if _darklink_record(executeid, piping, row, results, 'high'):
                # Persist the matched words onto the outlink document.
                finddata = {'domain':execute['domain'], 'md5_body':row['md5_body']}
                setdata = {'$set':{'filterwords':json.dumps(resultWord, ensure_ascii=False)}}
                mongoSpider['outlink'].find_and_modify(finddata, setdata)
                continue
        # Invisible links are suspicious but weak evidence ("low").
        if row['invisible']:
            if _darklink_record(executeid, piping, row, results, 'low'):
                continue
    # TODO: duplication / reference-count based detection (medium/high) is not implemented yet.
    # Bug fix: the original returned None (implicit) when results was empty; return True like
    # the sibling piping_* functions.
    return result_save(execute, piping, results) if results else True


def _darklink_record(executeid, piping, row, results, level):
    '''Record a dark-link finding for row unless its URL was already recorded.

    Saves a code snapshot, inserts it, and appends the result dict to results.
    Returns True when a new finding was appended, False when the URL was a duplicate.
    '''
    if any(params['darklink'] == row['url'] for params in results):
        return False
    filename = 'snap_code_darklink_%s.png' % row['id']
    snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=[row['url']])
    result = {'id':row['id'],'referer':row['referer'],'darklink':row['url'],'level':level,'snapshot':"\n".join(snapshots)}
    snapshot_insert(executeid, piping, row, result, snapshots)
    results.append(result)
    return True