示例#1
0
def piping_errorHttpCode(executeid):
    '''Flag crawled URLs whose HTTP status code is in the configured error list.

    Looks up the task's "error_http_code" piping; when no task-specific
    extend row is bound, falls back to the global default row (task_id=0).
    Returns True when there is nothing to report, otherwise the value of
    result_save().
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type': 'error_http_code'})
    if not piping: return True
    if piping['extend_id']:
        extend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    else:
        extend = db.fetchone('select data from piping_extend where task_id=0 and piping_type=:type and status=1', {'type': 'error_http_code'})
    if not extend:
        return True
    httpCodes = extend['data'].split("\n")
    results = []
    # Statuses 2 and 3 were previously scanned by two copy-pasted loops;
    # iterate over both statuses with a single loop body instead.
    for status in (2, 3):
        mgRows = mgdb.spiderurl_getall_executeid_status(executeid, status, ['id','url','http_code','url_type'])
        for row in mgRows:
            # Only same-domain pages of this site count as errors.
            if row['url_type'] != 'self': continue
            if not row['http_code']: continue
            if execute['domain'] != getDomainNoPort(row['url']): continue
            if str(row['http_code']) in httpCodes: results.append(row)
    return result_save(execute, piping, results) if results else True
示例#2
0
def piping_keyword(executeid):
    '''Keyword check: verify each configured URL still contains its required words.

    The extend data maps url -> list of words that MUST appear in the page;
    pages missing any required word are reported via result_save().
    Returns True when there is nothing to report.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type': 'keyword'})
    if not piping or not piping['extend_id']: return True
    # Task-owned keyword list (stored as JSON in piping_extend.data).
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    if not pipingExtend or not pipingExtend['data']: return True
    rows = json.loads(pipingExtend['data'])
    if not rows: return True
    kws = {it['url']: it['words'] for it in rows}
    results = []
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'file_path', 'file_extension','url_type'])
    for row in mgRows:
        # Skip links that do not belong to this site.
        if row['url_type'] != 'self': continue
        if row['url'] not in kws: continue
        if row['file_extension'] not in ('html', ''): continue
        url = row['url']
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8','ignore')
        result = Acism(kws[url]).scan(body)
        # Words that are required for this url but were not found in the body.
        wordsNoExists = list(set(kws[url]) - set(result.keys()))
        if wordsNoExists:
            filename = "snap_code_keyword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=wordsNoExists)
            pipingResult = {"id":row['id'], "url":row['url'], "noWords": wordsNoExists, "words": kws[url], 'snapshot':"\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
示例#3
0
def _seed_piping_extend(pipingType, content):
    '''Upsert the global default (task_id=0) piping_extend row for pipingType.'''
    row = db.fetchone(
        "select * from piping_extend where task_id=:task_id and piping_type=:piping_type limit 1",
        {
            'task_id': 0,
            'piping_type': pipingType
        })
    if not row:
        extendId = db.insert("piping_extend", {
            "app_id": 0,
            "site_id": 0,
            "task_id": 0,
            "piping_type": pipingType,
            "data": "",
            "status": 1,
        })
    else:
        extendId = row["id"]
    db.updatebyid("piping_extend", {
        "data": content.strip(),
        "status": 1
    }, extendId)


def init_system():
    '''Seed the database: the test app credentials plus the global default
    sensitive-word and error-http-code piping_extend rows.'''
    # Initial test-app token and keys.
    sql = "insert into app(id, unique_key, public_key, token, token_expired) values(1,'tester_app','-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEAvLWMYgTwkLMI8ZSw8Pd7NBKUVr0kbyqHijKOOQmR5/EKHOwgak0u\nu3+wBsllmIgfa4cT0zp4Gdd4hx2UmpIjG4eHwCgUCHHmCedu87/zEQhzE2do9p09\nBzPs7GG/azuynPJp6mZFxycaGZaoHH1d3FNWJ+yRBQ5UliFw01Tby3j7cV5u9fNU\nOjSZRGBNkHLxUi56kkbIZ46Wz14DVCjfZh6HRcwWKZHnQTDaIJKGKDbJoAbY/EIi\nrUc8OQl57PNq35hc0AJdFHa5oDQ5WtsCXx3q7XNhKjZdR/Vs4kljns5k9/zylJLn\nXI5ly2j46nz+feMaGVP1BdJpPUVWrAcgFQIDAQAB\n-----END RSA PUBLIC KEY-----','wbsllmigfa4ct0zp4gdd4hx2umpijg4e', '2017-03-22 17:01:53');"
    db.exec(sql)

    # System sensitive-word list.
    _seed_piping_extend('filterword', read("%s/doc/sensitive_word.txt" % PATH_ROOT))

    # System abnormal HTTP status codes. The lookup side
    # (piping_errorHttpCode) queries piping_type='error_http_code', so seed
    # under that name — the previous 'err_http_code' row could never match.
    _seed_piping_extend(
        'error_http_code',
        "\n".join(['401', '402', '403', '404', '405', '500', '501', '502', '503', '504']))
示例#4
0
def piping_filterword(executeid):
    '''Sensitive-word scan over the crawled HTML pages of this execute.

    The word list comes from the system table, the task's own extend row,
    or both, depending on piping['filterword_type'] ('system'/'own'/'mixed').
    Returns True when there is nothing to report.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'filterword'})
    if not piping: return True

    wordType = piping['filterword_type']
    # System word list. Initialised as a list (it was '' before, although
    # every use treats it as a list).
    systemWords = []
    if wordType in ('system', 'mixed'):
        pipingExtend = db.fetchall('select name from sys_filterword')
        systemWords = [row['name'] for row in pipingExtend] if pipingExtend else []
    # Task-owned word list (newline-separated text).
    ownWords = ''
    if wordType in ('own', 'mixed'):
        pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
        ownWords = pipingExtend['data'] if pipingExtend else ''

    if wordType == 'system':
        words = systemWords
    elif wordType == 'own':
        words = ownWords.split("\n")
    elif wordType == 'mixed':
        words = systemWords + ownWords.split('\n')
    else:
        words = []
    # De-duplicate and drop the empty string produced by splitting ''.
    words = list(set(words))
    if '' in words:
        words.remove('')
    if not words:
        return True

    acism = Acism(words)

    results = []
    rows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id','url', 'file_path', 'file_extension','url_type'])
    for row in rows:
        # Only same-site HTML pages are scanned.
        if row['url_type'] != 'self': continue
        if row['file_extension'] not in ('html', ''): continue
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8','ignore')
        result = acism.scan(body)
        if result:
            filename = "snap_code_filterword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=result.keys())
            pipingResult = {"id":row['id'], "url":row['url'], "matches":result, 'snapshot':"\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
示例#5
0
def task_start(taskid):
    '''Create a task_execute row for the task and return its id (False on error).'''
    try:
        task = db.fetchone('select * from task where id=:id', {'id': taskid})
        if not task:
            return False
        startUrls = json.loads(task['start_urls'])
        # Columns copied verbatim from the task row into the execute row.
        copyKeys = (
            'site_id', 'app_id', 'start_urls', 'exec_level', 'limit_depth',
            'limit_total', 'limit_time', 'limit_subdomain', 'limit_image',
            'limit_js', 'limit_jsevent', 'exclude_urls', 'url_unique_mode',
            'notify_url', 'source_ip', 'proxies',
        )
        executedata = {key: task[key] for key in copyKeys}
        executedata['task_id'] = task['id']
        executedata['task_type'] = task['type']
        executedata['domain'] = getDomainNoPort(startUrls[0])
        executedata['status'] = 0
        return db.insert('task_execute', executedata)
    except Exception as e:
        logger.exception(e)
        return False
示例#6
0
def task_get_id(id):
    '''Fetch a task row by id; timestamps formatted, start_urls in raw form.

    Returns False when the task does not exist.
    '''
    row = db.fetchone('select * from task where id=:id', {'id': id})
    if not row:
        return False
    row['start_urls'] = _startUrls2Raw(row['start_urls'])
    for key in ('create_at', 'update_at'):
        row[key] = formatTimestamp(row[key])
    return row
示例#7
0
def result_save(execute, piping, results):
    '''Insert or update the task_piping_result row for this execute/piping pair.

    results is stored as a JSON string; the same row shape is used for
    filter words, keywords, fingerprints, error http codes and dark links.
    Returns the task_piping_result row id.
    '''
    record = {
        'app_id': execute['app_id'],
        'site_id': execute['site_id'],
        'task_id': execute['task_id'],
        'execute_id': execute['id'],
        'piping_id': piping['id'],
        'type': piping['type'],
        'result': json.dumps(results, ensure_ascii=False),
        'status': 1,
        'audit_status': 0,
    }
    existing = db.fetchone('select id from task_piping_result where execute_id=:eid and piping_id=:pid', {'eid': execute['id'], 'pid': piping['id']})
    if existing:
        resultId = existing['id']
        db.updatebyid('task_piping_result', record, resultId)
    else:
        resultId = db.insert('task_piping_result', record)
    bNotify.save(execute['id'], 'piping_%s' % piping['type'], {'piping_status':'ok'})
    return resultId
示例#8
0
def send(notifyid = None):
    '''POST one task_notify callback and record the outcome on the row.

    Returns a status dict; status 0 means the notify row does not exist.
    (The unused reData local was removed.)
    '''
    row = db.fetchone("select * from task_notify where id=:id", {'id': notifyid})
    if not row:
        return {'status':0, 'msg':'notify[%s] is not exists' % notifyid, 'donotify_notifyid':notifyid}

    try:
        data = {
            'id': row['id'],
            'app_id': row['app_id'],
            'task_id': row['task_id'],
            'site_id': row['site_id'],
            'execute_id': row['execute_id'],
            'task_type': row['task_type'],
        }
        # Merge any stored request payload over the base fields.
        requestData = json.loads(row['request_data']) if row['request_data'] else {}
        data = dict(data, **requestData)
        data = json.dumps(data, ensure_ascii=False)
        request = Request(row['notify_url'], method='POST')
        request.add_header('Content-Type', 'application/json')
        response = urlopen(request, data.encode('utf8'), timeout=5)
        body = response.read().decode()
        # Only the body is compared here; a non-200 response raises
        # HTTPError and lands in the except branch below.
        if body == 'ok':
            db.updatebyid('task_notify', {'status':'2', 'response_data':body, 'error': ''}, row['id'])
        else:
            error = 'the httpcode require 200, the body require ok;'
            db.updatebyid('task_notify', {'status':'301', 'response_data':body, 'error': error}, row['id'])
        return {'status':1, 'msg':'notify ok', 'donotify_notifyid':notifyid}
    except Exception as e:
        logger.error("doNotify::" + str(notifyid) + "::" + repr(e))
        db.updatebyid('task_notify', {'status':'3', 'error':repr(e)}, row['id'])
        return {'status':1, 'msg':repr(e), 'donotify_notifyid':notifyid}
示例#9
0
def getToken_key(key):
    '''Issue a fresh token (valid for 2 hours) for the app with this unique_key.

    Returns {'token', 'expired'} or False when the key is unknown.
    '''
    appObj = db.fetchone('select * from app where unique_key=:key', {'key': key})
    if not appObj:
        return False
    expireAt = getTime("%Y-%m-%d %H:%M:%S" , (getTime() + 7200))
    # NOTE(review): the token is md5 of the expiry timestamp, which is
    # predictable — consider a secrets-based token. Behavior kept as-is.
    newToken = md5(expireAt)
    db.updatebyid('app', {'token': newToken, 'token_expired': expireAt}, appObj['id'])
    return {'token': newToken, 'expired': expireAt}
示例#10
0
def _getSiteid(url):
    '''Return the site id for url, creating site/domain rows as needed.'''
    host = urlparse(url)[1]
    if ':' in host:
        host = host.split(':')[0]

    mainDomain = getDomainMain(host)
    siteRow = db.fetchone('select * from site where domain=:domain',
                          {'domain': mainDomain})
    if siteRow:
        siteid = siteRow['id']
    else:
        siteid = db.insert('site', {'domain': mainDomain})

    # Register the subdomain when the host is not the main domain itself.
    if host != mainDomain:
        subRow = db.fetchone(
            'select * from domain where subdomain=:domain', {'domain': host})
        if not subRow:
            db.insert('domain', {'site_id': siteid, 'subdomain': host})

    return siteid
示例#11
0
def get_id(settingid=None):
    '''Fetch a setting row by id with formatted timestamps, or False.'''
    # Use a named bind parameter for consistency: every other query in this
    # codebase uses the :name style, not a positional "%s" placeholder.
    row = db.fetchone("select * from setting where id=:id", {'id': settingid})
    if row:
        row['create_at'] = formatTimestamp(row['create_at'])
        row['update_at'] = formatTimestamp(row['update_at'])
        return row
    else:
        return False
示例#12
0
def get_id(appid=None):
    '''Fetch an app row by id (public_key stripped), or False when missing.'''
    app = db.fetchone("select * from app where id=:id", {'id': appid})
    if not app:
        return False
    for key in ('token_expired', 'create_at', 'update_at'):
        app[key] = formatTimestamp(app[key])
    # Never expose the RSA public key through this accessor.
    app.pop('public_key')
    return app
示例#13
0
def execute_getnew_taskid(taskid):
    '''Return the latest task_execute row for a task, or False when none exists.'''
    row = db.fetchone(
        'select * from task_execute where task_id=:id order by id desc',
        {'id': taskid})
    if not row:
        return False
    for key in ('start_at', 'end_at', 'create_at', 'update_at'):
        row[key] = formatTimestamp(row[key])
    return row
示例#14
0
def task_getnew_id(taskid):
    '''Return the latest task_execute row for a task, timestamps formatted.

    Returns False when the task has never been executed — the original
    indexed the missing row and crashed with a TypeError (the sibling
    execute_getnew_taskid already guards this case).
    '''
    execute = db.fetchone(
        'select * from task_execute where task_id=:task_id order by id desc limit 1',
        {'task_id': taskid})
    if not execute: return False
    # start/end may be NULL while the execute is still running.
    execute['start_at'] = formatTimestamp(
        execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(
        execute['end_at']) if execute['end_at'] else ''
    execute['create_at'] = formatTimestamp(execute['create_at'])
    execute['update_at'] = formatTimestamp(execute['update_at'])
    return execute
示例#15
0
def execute_get_id(executeid):
    '''Return the task_execute row for executeid with formatted timestamps, or False.'''
    row = db.fetchone('select * from task_execute where id=:id',
                      {'id': executeid})
    if not row:
        return False
    # start/end may be NULL while the execute is still running.
    for key in ('start_at', 'end_at'):
        row[key] = formatTimestamp(row[key]) if row[key] else ''
    for key in ('create_at', 'update_at'):
        row[key] = formatTimestamp(row[key])
    return row
示例#16
0
 def _wrapper(*args, **kwargs):
     '''Auth guard: resolve the calling app by its token header before func runs.'''
     # .get() instead of [] so a missing header yields the normal
     # "token error" response instead of an unhandled KeyError (HTTP 500).
     token = request.headers.get('token')
     appObj = db.fetchone("select * from app where token=:token",
                          {'token': token})
     if not appObj:
         return {
             'status': 'failed',
             'msg': 'token error',
             'token': token
         }, 200
     session['app'] = appObj
     return func(*args, **kwargs)
示例#17
0
def piping_fingerprint(executeid):
    '''Fingerprint check: compare static-asset body hashes against the previous run.

    Assets whose md5_body changed between the previous execute and this one
    are snapshotted and reported. Returns True when there is nothing to do.
    '''
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    if not execute: return True
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid':execute['task_id'], 'type':'fingerprint'})
    if not piping: return True
    executeOld = db.fetchone('select * from task_execute where task_id=:tid and id<:eid order by id desc', {'tid': execute['task_id'], 'eid':executeid})
    if not executeOld: return True
    exts = ['js','css','jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp']

    def _md5_index(eid):
        '''Map md5_url -> row for the static assets of one execute.'''
        rows = mgdb.spiderurl_getall_executeid_status(eid, 2, ['id', 'url','md5_url', 'md5_body', 'status', 'file_extension'])
        return {row['md5_url']: row for row in rows if row['file_extension'] in exts}

    # The two index-building loops were copy-pasted; build both via the helper.
    rowsDict = _md5_index(executeid)
    rowsDictOld = _md5_index(executeOld['id'])
    results = []
    for md5Url, row in rowsDict.items():
        old = rowsDictOld.get(md5Url)
        # New assets (no previous row) and unchanged bodies are not reported.
        if not old or row['md5_body'] == old['md5_body']:
            continue
        filename = "snap_view_%s.png" % row['id']
        snapshot = _snapshot_save(executeid, 'view', row['url'], filename)[0]
        pipingResult = {
            'url':row['url'],
            'md5_url':md5Url,
            'md5_body_new':row['md5_body'],
            'md5_body_old':old['md5_body'],
            'snapshot':snapshot
        }
        snapshot_insert(executeid, piping, row, pipingResult, snapshot)
        results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
示例#18
0
def piping_getall_taskid(taskid = None):
    '''Return all piping rows of a task, keyed by piping type.

    For filterword/keyword pipings the word data from piping_extend is
    attached as piping['words'].
    '''
    pipings = {}
    taskPipings = db.fetchall('select * from task_piping where task_id=:id', {'id': taskid})
    for piping in taskPipings:
        piping['create_at'] = formatTimestamp(piping['create_at'])
        piping['update_at'] = formatTimestamp(piping['update_at'])
        pipingType = piping['type']
        if pipingType in ['filterword', 'keyword']:
            pipingExtend = db.fetchone('select * from piping_extend where id=:id', {'id': piping['extend_id']})
            # Guard against a missing/deleted extend row — previously this
            # crashed with a TypeError when the lookup returned nothing.
            piping['words'] = pipingExtend['data'] if pipingExtend else ''
        pipings[pipingType] = piping
    return pipings
示例#19
0
def run():
    '''Run scripts/createdb.sql against the master database, then print a probe row.'''
    file_path = 'scripts/createdb.sql'
    with open(file_path, 'r') as fh:
        script = fh.read()

    # NOTE(review): connection details are hard-coded; credentials appear
    # redacted ('******') — confirm they are injected elsewhere before running.
    with pymssql.connect(
        server='db.bigmountaintiger.com',
        port='1433',
        user='******',
        password='******',
        database='master'
    ) as conn:
        # DROP/CREATE DATABASE DDL cannot run inside a transaction,
        # so autocommit must be enabled.
        conn.autocommit(True)
        with conn.cursor() as cur:
            cur.execute(script)

    r = db.fetchone('SELECT * FROM Test_table')
    print(r)
def print_current():
    '''Print the first row of Test_table (debug helper).'''
    row = db.fetchone('SELECT * FROM Test_table')
    print(row)
示例#21
0
def piping_darklink(executeid):
    '''Dark-link detection over the outbound links of one execute.

    A link is reported with a confidence level:
      - 'absolute' : listed in the personal or system blacklist
      - 'high'     : its page body matches the sensitive-word scanner
      - 'low'      : the link element is invisible in the page
    Whitelisted links are skipped. Returns True when nothing was found,
    otherwise the value of result_save(). (The original implicitly
    returned None when results was empty; True is now returned for
    consistency with the other piping_* functions.)
    '''
    execute = mgdb.execute_getbyid(executeid)
    if not execute: return False
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1', {'tid': execute['task_id'], 'type':'darklink'})
    if not piping: return True
    pipingExtend = db.fetchall('select name from sys_filterword')
    words = [row['name'] for row in pipingExtend] if pipingExtend else []
    acism = Acism(words)
    # Global (system) black/white lists, de-duplicated.
    rows = db.fetchall('select domain from dk_white_list')
    whites_glb = list(set(row['domain'] for row in rows)) if rows else []
    rows = db.fetchall('select domain from dk_black_list')
    blacks_glb = list(set(row['domain'] for row in rows)) if rows else []
    # Personal (per-task) black/white lists. piping_save() writes this field
    # with json.dumps, so parse it with json.loads — the original used
    # eval(), which executes arbitrary expressions from stored data.
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1', {'id': piping['extend_id']})
    lists = json.loads(pipingExtend['data'])
    whites_psl = list(set(lists['white_list']))
    blacks_psl = list(set(lists['black_list']))

    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2, ['id', 'url', 'md5_url', 'md5_body', 'url_type','file_extension','file_path', 'invisible','referer'])
    results = []

    def _record(row, level):
        '''Append a dark-link hit (with code snapshot) unless this url is
        already recorded. Returns True when a new hit was recorded.'''
        seen = [item['darklink'] for item in results]
        if row['url'] in seen:
            return False
        filename = 'snap_code_darklink_%s.png' % row['id']
        snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=[row['url']])
        result = {'id': row['id'], 'referer': row['referer'], 'darklink': row['url'], 'level': level, 'snapshot': "\n".join(snapshots)}
        snapshot_insert(executeid, piping, row, result, snapshots)
        results.append(result)
        return True

    for row in mgRows:
        # Personal whitelist/blacklist first.
        if row['url'] in whites_psl: continue
        if row['url'] in blacks_psl:
            # Falls through to the later checks when the url was already
            # recorded, matching the original control flow.
            if _record(row, 'absolute'): continue
        # Static files and same-site links are not dark links.
        if row['url_type'] != 'other': continue
        if row['file_extension'] not in ['', 'html']: continue
        # System whitelist/blacklist (100% confidence -> absolute).
        if row['url'] in whites_glb: continue
        if row['url'] in blacks_glb:
            if _record(row, 'absolute'): continue
        # Sensitive-word scan of the page body (high confidence).
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8','ignore')
        resultWord = acism.scan(body)
        if resultWord:
            if _record(row, 'high'):
                finddata = {'domain':execute['domain'], 'md5_body':row['md5_body']}
                setdata = {'$set':{'filterwords':json.dumps(resultWord, ensure_ascii=False)}}
                mongoSpider['outlink'].find_and_modify(finddata, setdata)
                continue
        # Invisible link element (low confidence).
        if row['invisible']:
            if _record(row, 'low'): continue
    return result_save(execute, piping, results) if results else True
示例#22
0
def task_save(params=None):
    '''Create or update a task and (re)schedule its execution.

    params must contain 'id' (falsy for create) or 'start_urls'.
    Depending on params this either schedules a one-shot run
    ('execute_at'/'execute_delay'), starts the task immediately, or
    creates/updates/deletes a cron scheduler job.
    Returns the task id, or False for invalid input.
    '''
    if not params['id'] and not params['start_urls']: return False
    if params['start_urls']:
        startUrls = params['start_urls'].split("\n")
        params['start_urls'] = json.dumps(startUrls, ensure_ascii=False)
    else:
        params['start_urls'] = ''

    # Column defaults for newly created tasks.
    # ('exec_level' was listed twice in the original dict; one entry kept.)
    defaultKeys = {
        'app_id': 0,
        'type': 'spider',
        'start_urls': '',
        'exec_level': 0,
        'limit_depth': 2,
        'limit_total': 1000,
        'limit_time': 0,
        'limit_subdomain': 0,
        'limit_image': 0,
        'limit_js': 0,
        'url_unique_mode': 'url-query',
        'notify_url': '',
        'source_ip': '',
        'exclude_urls': '',
        'proxies': '',
        'crontab': '',
        'status': 0,
    }

    # One-shot scheduling: absolute time, or a delay in seconds.
    rundate = None
    if 'execute_at' in params.keys() and params['execute_at']:
        rundate = datetime.strptime(params['execute_at'], '%Y-%m-%d %H:%M:%S')

    if 'execute_delay' in params.keys() and params['execute_delay']:
        rundateStr = getTime('%Y-%m-%d %H:%M:%S',
                             getTime() + params['execute_delay'])
        rundate = datetime.strptime(rundateStr, '%Y-%m-%d %H:%M:%S')

    # Persist the task row (only truthy params overwrite existing values).
    taskdata = {}
    keys = defaultKeys.keys()
    if params['id']:
        taskid = params['id']
        for key in keys:
            if key in params.keys() and params[key]:
                taskdata[key] = params[key]
        db.updatebyid('task', taskdata, taskid)
    else:
        taskdata['site_id'] = _getSiteid(startUrls[0])
        for key in keys:
            if key in params.keys() and params[key]:
                taskdata[key] = params[key]
            else:
                taskdata[key] = defaultKeys[key]
        taskid = db.insert('task', taskdata)

    # Date-triggered (one-shot) job.
    jobid = 'task_%s' % taskid
    if rundate:
        job = db.getbyid('scheduler', jobid)
        if job:
            db.updatebyid('scheduler', {'run_date': rundate}, jobid)
        else:
            scheduler = {
                'id': jobid,
                'name': jobid,
                'func': 'business.task:task_start',
                'args': '[' + str(taskid) + ']',
                'trigger_type': 'date',
                'run_date': rundate,
                'coalesce': 0,
                'next_run_time': rundate,
                'max_instances': 3,
                'executor': 'default',
                'misfire_grace_time ': 1,
            }
            # BUG fix: db.insert() was called without the table name
            # (compare the cron branch below).
            db.insert('scheduler', scheduler)
        return taskid

    # No crontab: start the task immediately.
    task = db.fetchone("select * from task where id=:id", {'id': taskid})
    if not task['crontab']:
        task_start(taskid)
        return taskid

    # Disabled task with a crontab: drop its scheduled job.
    # Read from the freshly fetched task row — taskdata only holds the
    # truthy keys that were passed in, so indexing it here (as the original
    # did) could raise KeyError on the update path.
    if task['status'] < 1 and task['crontab']:
        db.exec('delete from scheduler where id=:id', {'id': jobid})
        return taskid

    # Cron-triggered job: create or update.
    job = db.getbyid('scheduler', jobid)
    if job:
        crontab = '0 ' + task['crontab'] + ' * *,SMHdmwWY'
        db.updatebyid('scheduler', {'crontab': crontab}, jobid)
    else:
        tz = pytz.timezone('Asia/Shanghai')
        scheduler = {
            'id': jobid,
            'name': jobid,
            'func': 'business.task:task_start',
            'args': '[' + str(taskid) + ']',
            'trigger_type': 'cron',
            'crontab': '0 ' + task['crontab'] + ' * *,SMHdmwWY',
            'coalesce': 0,
            'next_run_time':
            datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S%z'),
            'max_instances': 3,
            'executor': 'default',
            'misfire_grace_time ': 1,
        }
        db.insert('scheduler', scheduler)

    return taskid
示例#23
0
def piping_save(rows=None,taskid=None):
    '''Persist piping configuration rows for a task.

    Each entry in rows describes one piping type (darklink / filterword /
    keyword / error_http_code). Its auxiliary data is upserted into
    piping_extend, then the task_piping row itself is created or updated.
    Always returns True.
    '''
    task = db.fetchone('select * from task where id=:id', {'id':taskid})
    for row in rows:
        taskPiping = db.fetchone('select * from task_piping where task_id=:tid and type=:type', {'tid':taskid, 'type':row['type']})
        extendId = 0
        pipingExtendOld = None
        if taskPiping:
            extendId = taskPiping['extend_id']
            pipingExtendOld = db.fetchone('select * from piping_extend where id=:id', {'id': extendId})
        # A word id of 0 means the system default word list is used.
        # Dark-link piping: store personal white/black lists as one JSON blob.
        if row['type'] == 'darklink':
            pipingExtend={}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            # NOTE(review): eval() here just round-trips the JSON built on the
            # line above; json.loads would be the safe equivalent — confirm
            # before changing.
            white_list = json.dumps(row['white_list'], ensure_ascii=False) if row['white_list'] else '[]'
            white_list = {'white_list':eval(white_list)}
            black_list = json.dumps(row['black_list'], ensure_ascii=False) if row['black_list'] else '[]'
            black_list = {'black_list':eval(black_list)}
            pipingExtend['data'] = json.dumps(dict(white_list, **black_list))
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)

        # Filter-word piping: merge the submitted words with the stored ones
        # according to filterword_operate ('own' / 'plus' / 'reduce').
        if row['type'] == 'filterword' and 'filterwords' in row.keys() and 'filterword_operate' in row.keys():
            words = []
            wordsOld = []
            wordsNew = row['filterwords'].replace(' ', '').split("\n")
            if pipingExtendOld:
                extendId = pipingExtendOld['id']
                wordsOld = pipingExtendOld['data'].split("\n") if pipingExtendOld['data'] else []
            # Overwrite the task's own word list.
            if row['filterword_operate'] == 'own':
                words = wordsNew
            # Add words to the existing list.
            if row['filterword_operate'] == 'plus':
                words.extend(wordsNew)
                if wordsOld: words.extend(wordsOld)
            # Remove words from the existing list.
            if row['filterword_operate'] == 'reduce' and wordsOld:
                wordsCommon = list(set(wordsNew) & set(wordsOld))
                for word in wordsCommon: wordsOld.remove(word)
                words = wordsOld
            if '' in words: words.remove('')
            words = list(set(words))
            pipingExtend = {}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            pipingExtend['data'] = "\n".join(words)
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)
        # Keyword piping: store the keyword list as JSON.
        if row['type'] == 'keyword' and 'keywords' in row.keys():
            pipingExtend = {}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            pipingExtend['data'] = json.dumps(row['keywords'], ensure_ascii=False) if row['keywords'] else ''
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)
        # Error-http-code piping: store the raw code list.
        if row['type'] == 'error_http_code' and 'http_codes' in row.keys():
            pipingExtend = {}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            pipingExtend['data'] = row['http_codes']
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)
        # Finally create/update the task_piping row pointing at the extend row.
        wordType = row['filterword_type'] if 'filterword_type' in row.keys() else ''
        status = row['status'] if 'status' in row.keys() else 1
        piping = {}
        piping['status'] = status
        piping['extend_id'] = extendId
        piping['filterword_type'] = wordType
        if taskPiping:
            pipingId = db.updatebyid('task_piping', piping, taskPiping['id'])
        else:
            piping['app_id'] = task['app_id']
            piping['site_id'] = task['site_id']
            piping['task_id'] = taskid
            piping['type'] = row['type']
            pipingId = db.insert('task_piping', piping)
    return True
示例#24
0
def crawl(urlInfo):
    uI = urlInfo
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute: return False
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})

    try:
        ##如果任务已结束,则返回
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True

        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            uI['id'], uI['task_id'], uI['execute_id'], uI['method'], uI['url']
        ))

        # 抓取页面,解析数据
        response = {}
        urlItems = []
        #proxy = {'url':'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'], urlInfo['request_headers'], proxy=proxy)

        # 请求错误,直接返回
        if requestInfo['error']:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True

        # 304或其他状态码,直接返回
        if requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True

        # 正常请求
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders['Content-Type'] if 'Content-Type' in responseHeaders.keys() else None
        contentType = parseContentType(contentTypeRaw, default = 'text/html')
        fileType = mime2file(contentType)
        #logger.debug("Content-Type::::::::" + contentTypeRaw + "::::" + contentType)

        #保存响应信息
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])

        #非html页面,直接返回
        if fileType != 'html': return True

        #外部连接,不再进一步分析
        if urlInfo['url_type'] != 'self': return True

        # 如果是单页面镜像,不分析页面
        if execute['task_type'] == 'mirror_one': return True

        #正则解析页面
        urlItems = parse_reg(requestInfo)
        #检测暗链
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            # logger.info('parse_darklink::::%s::::' % (result))
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks

        '''
        浏览器解析部分
        '''
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results: urlItems = urlItems + results

        # logger.info('parse_darklink::::%s::::%s' % ('urls_uniq', json.dumps(urlItems)))
        # url去重
        urlItems = _urls_uniq(urlItems)
        # 追加新的URL
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        # logger.info('parse_darklink::::%s::::' % (urlItems))
        # logger.info('parse_darklink::::%s::::%s' % ('urlItems', json.dumps(urlItems)))
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url): continue

            fileExtension = extension(url)

            urlType = _getDomainType(url, execute['domain'])
            # isExists = _checkUrlExists(execute['id'], url, row['method'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists: continue

            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''

            # 非本站链接或不分析暗链,状态标为5,即不需要抓取

            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark: 
                        flagOutlink = 1
                        item['status'] = 0
            if urlType == 'other': 
                outlinks.append(_formatOutlink(execute, urlInfo['url'], url, row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)

        # logger.info('22parse_darklink::::%s::::%s' % ('queueSite', json.dumps(queueSite)))
        # logger.info('22parse_darklink::::%s::::%s' % ('queueOut', json.dumps(queueOut)))
        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems, response['md5_body'], 'regular'))
        if outlinks: mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            # logger.info('parse_darklink::::::::%s' % (queueSite))
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # 状态位非0,不抓取
                if item['status'] != 0: continue
                # 深度超过限制,不抓取
                if item['depth'] > execute['limit_depth']: continue
                # 总数超过限制,不抓取
                if stats['total'] > execute['limit_total']: continue
                # 镜像,不抓取图片
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts: continue
                # 单页面监测,不抓取子页面
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts: continue
                # 不抓取图片
                if not execute['limit_image'] and item['file_extension'] in staticExts: continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']

                #数据放入待抓取队列
                undos.append(item)

                #数据放入镜像队列
                if execute['task_type'] == 'mirror': mirrors.append(item)
        if queueOut:
            # logger.info('parse_darklink::::::::%s' % (queueOut))
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results: 
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos: Mq.produce(undos, 'spider')
        if mirrors: Mq.produce(mirrors, 'mirror')

    except Exception as e:
        logger.exception(e)
        return False