def piping_errorHttpCode(executeid):
    """Error HTTP status-code check for one execution.

    Compares the HTTP codes of this site's crawled URLs against the
    configured error-code list and records any matches.

    Args:
        executeid: id of the task_execute row to inspect.

    Returns:
        True when nothing matched (or the piping is not configured),
        otherwise the id of the saved result row (from result_save).
    """
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'error_http_code'})
    if not piping:
        return True
    # Task-specific code list, falling back to the system default (task_id=0).
    if piping['extend_id']:
        extend = db.fetchone('select data from piping_extend where id=:id and status=1',
                             {'id': piping['extend_id']})
    else:
        extend = db.fetchone('select data from piping_extend where task_id=0 and piping_type=:type and status=1',
                             {'type': 'error_http_code'})
    if not extend:
        return True
    httpCodes = extend['data'].split("\n")
    results = []
    # The scans for status 2 and status 3 URLs were duplicated verbatim;
    # run the identical filter once per status instead.
    for status in (2, 3):
        mgRows = mgdb.spiderurl_getall_executeid_status(executeid, status, ['id', 'url', 'http_code', 'url_type'])
        for row in mgRows:
            if row['url_type'] != 'self':
                continue
            if not row['http_code']:
                continue
            # Only URLs on the execution's own domain are reported.
            if execute['domain'] != getDomainNoPort(row['url']):
                continue
            if str(row['http_code']) in httpCodes:
                results.append(row)
    return result_save(execute, piping, results) if results else True
def piping_keyword(executeid):
    """Keyword filtering: verify each configured URL still contains its
    required keywords and report the ones that have gone missing."""
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'keyword'})
    if not piping or not piping['extend_id']:
        return True
    # Own keyword store
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1',
                               {'id': piping['extend_id']})
    if not pipingExtend:
        return True
    if not pipingExtend['data']:
        return True
    entries = json.loads(pipingExtend['data'])
    if not entries:
        return True
    expected = {entry['url']: entry['words'] for entry in entries}
    results = []
    mgRows = mgdb.spiderurl_getall_executeid_status(executeid, 2,
                                                    ['id', 'url', 'file_path', 'file_extension', 'url_type'])
    for row in mgRows:
        # Links that do not belong to this site yield no results.
        if row['url_type'] != 'self':
            continue
        if row['url'] not in expected:
            continue
        if row['file_extension'] not in ('html', ''):
            continue
        url = row['url']
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        found = Acism(expected[url]).scan(body)
        # Words required for this URL that the scan did not find.
        missing = list(set(expected[url]).difference(set(found.keys())))
        if missing:
            filename = "snap_code_keyword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=missing)
            pipingResult = {"id": row['id'], "url": row['url'], "noWords": missing,
                            "words": expected[url], 'snapshot': "\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
def _init_system_extend(pipingType, content):
    """Upsert the system-wide (task_id=0) piping_extend row for one type."""
    row = db.fetchone(
        "select * from piping_extend where task_id=:task_id and piping_type=:piping_type limit 1",
        {'task_id': 0, 'piping_type': pipingType})
    if not row:
        insertRow = {
            "app_id": 0,
            "site_id": 0,
            "task_id": 0,
            "piping_type": pipingType,
            "data": "",
            "status": 1
        }
        extendId = db.insert("piping_extend", insertRow)
    else:
        extendId = row["id"]
    db.updatebyid("piping_extend", {"data": content.strip(), "status": 1}, extendId)


def init_system():
    """Bootstrap the system: seed the initial app token, the system
    sensitive-word dictionary and the default error HTTP status codes."""
    # Seed the initial app credentials/token.
    sql = "insert into app(id, unique_key, public_key, token, token_expired) values(1,'tester_app','-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEAvLWMYgTwkLMI8ZSw8Pd7NBKUVr0kbyqHijKOOQmR5/EKHOwgak0u\nu3+wBsllmIgfa4cT0zp4Gdd4hx2UmpIjG4eHwCgUCHHmCedu87/zEQhzE2do9p09\nBzPs7GG/azuynPJp6mZFxycaGZaoHH1d3FNWJ+yRBQ5UliFw01Tby3j7cV5u9fNU\nOjSZRGBNkHLxUi56kkbIZ46Wz14DVCjfZh6HRcwWKZHnQTDaIJKGKDbJoAbY/EIi\nrUc8OQl57PNq35hc0AJdFHa5oDQ5WtsCXx3q7XNhKjZdR/Vs4kljns5k9/zylJLn\nXI5ly2j46nz+feMaGVP1BdJpPUVWrAcgFQIDAQAB\n-----END RSA PUBLIC KEY-----','wbsllmigfa4ct0zp4gdd4hx2umpijg4e', '2017-03-22 17:01:53');"
    db.exec(sql)
    # System sensitive-word dictionary.
    _init_system_extend('filterword', read("%s/doc/sensitive_word.txt" % PATH_ROOT))
    # System error HTTP status codes.
    # Fixed: was seeded under 'err_http_code', but the consumers
    # (piping_errorHttpCode, piping_save) look up 'error_http_code',
    # so the system-wide fallback row was never found.
    content = "\n".join(
        ['401', '402', '403', '404', '405', '500', '501', '502', '503', '504'])
    _init_system_extend('error_http_code', content)
def piping_filterword(executeid):
    """Sensitive-word filtering: scan this site's fetched html pages for
    banned words (system, own, or mixed dictionary) and record hits."""
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'filterword'})
    if not piping:
        return True
    mode = piping['filterword_type']
    # System dictionary
    systemWords = ''
    if mode in ['system', 'mixed']:
        sysRows = db.fetchall('select name from sys_filterword')
        systemWords = [r['name'] for r in sysRows] if sysRows else []
    # Own dictionary
    ownWords = ''
    if mode in ['own', 'mixed']:
        extendRow = db.fetchone('select data from piping_extend where id=:id and status=1',
                                {'id': piping['extend_id']})
        ownWords = extendRow['data'] if extendRow else ''
    if mode == 'system':
        words = systemWords
    elif mode == 'own':
        words = ownWords.split("\n")
    elif mode == 'mixed':
        words = systemWords + ownWords.split('\n')
    else:
        words = []
    words = list(set(words))
    if '' in words:
        words.remove('')
    if not words:
        return True
    acism = Acism(words)
    results = []
    rows = mgdb.spiderurl_getall_executeid_status(executeid, 2,
                                                  ['id', 'url', 'file_path', 'file_extension', 'url_type'])
    for row in rows:
        if row['url_type'] != 'self':
            continue
        if row['file_extension'] not in ('html', ''):
            continue
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        matches = acism.scan(body)
        if matches:
            filename = "snap_code_filterword_%s.png" % row['id']
            snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=matches.keys())
            pipingResult = {"id": row['id'], "url": row['url'], "matches": matches,
                            'snapshot': "\n".join(snapshots)}
            snapshot_insert(executeid, piping, row, pipingResult, snapshots)
            results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
def task_start(taskid):
    """Create a task_execute row for the given task.

    Returns the new execution id, or False when the task does not exist
    or creation fails.
    """
    try:
        task = db.fetchone('select * from task where id=:id', {'id': taskid})
        if not task:
            return False
        startUrls = json.loads(task['start_urls'])
        # Copy the task's crawl settings verbatim into the execution record.
        copied = ['site_id', 'app_id', 'start_urls', 'exec_level', 'limit_depth',
                  'limit_total', 'limit_time', 'limit_subdomain', 'limit_image',
                  'limit_js', 'limit_jsevent', 'exclude_urls', 'url_unique_mode',
                  'notify_url', 'source_ip', 'proxies']
        executedata = {key: task[key] for key in copied}
        executedata['task_id'] = task['id']
        executedata['task_type'] = task['type']
        executedata['domain'] = getDomainNoPort(startUrls[0])
        executedata['status'] = 0
        return db.insert('task_execute', executedata)
    except Exception as e:
        logger.exception(e)
        return False
def task_get_id(id):
    """Fetch one task row by id with display-formatted fields.

    Returns False when the task does not exist.
    """
    task = db.fetchone('select * from task where id=:id', {'id': id})
    if not task:
        return False
    task['start_urls'] = _startUrls2Raw(task['start_urls'])
    for field in ('create_at', 'update_at'):
        task[field] = formatTimestamp(task[field])
    return task
def result_save(execute, piping, results):
    """Persist one piping result and fire the piping notification.

    The result list (sensitive words, keywords, fingerprints, error
    status codes or dark links) is stored as a JSON string. The row is
    upserted on (execute_id, piping_id).

    Returns the task_piping_result row id.
    """
    record = {
        'app_id': execute['app_id'],
        'site_id': execute['site_id'],
        'task_id': execute['task_id'],
        'execute_id': execute['id'],
        'piping_id': piping['id'],
        'type': piping['type'],
        'result': json.dumps(results, ensure_ascii=False),
        'status': 1,
        'audit_status': 0,
    }
    existing = db.fetchone('select id from task_piping_result where execute_id=:eid and piping_id=:pid',
                           {'eid': execute['id'], 'pid': piping['id']})
    if existing:
        resultId = existing['id']
        db.updatebyid('task_piping_result', record, resultId)
    else:
        resultId = db.insert('task_piping_result', record)
    bNotify.save(execute['id'], 'piping_%s' % piping['type'], {'piping_status': 'ok'})
    return resultId
def send(notifyid=None):
    """Deliver one task_notify callback by POSTing its payload as JSON.

    The remote endpoint must answer HTTP 200 with body "ok"; anything
    else is recorded as a failed delivery on the task_notify row.

    Args:
        notifyid: id of the task_notify row to deliver.

    Returns:
        A dict with 'status' (0 = row missing, 1 = attempted), 'msg' and
        'donotify_notifyid'.
    """
    # Fixed: removed the unused `reData` local that shadowed the real
    # return values built below.
    row = db.fetchone("select * from task_notify where id=:id", {'id': notifyid})
    if not row:
        return {'status': 0, 'msg': 'notify[%s] is not exists' % notifyid, 'donotify_notifyid': notifyid}
    try:
        data = {
            'id': row['id'],
            'app_id': row['app_id'],
            'task_id': row['task_id'],
            'site_id': row['site_id'],
            'execute_id': row['execute_id'],
            'task_type': row['task_type'],
        }
        # Merge any stored request payload over the base fields.
        requestData = json.loads(row['request_data']) if row['request_data'] else {}
        data = dict(data, **requestData)
        data = json.dumps(data, ensure_ascii=False)
        request = Request(row['notify_url'], method='POST')
        request.add_header('Content-Type', 'application/json')
        response = urlopen(request, data.encode('utf8'), timeout=5)
        body = response.read().decode()
        if body == 'ok':
            db.updatebyid('task_notify', {'status': '2', 'response_data': body, 'error': ''}, row['id'])
        else:
            error = 'the httpcode require 200, the body require ok;'
            db.updatebyid('task_notify', {'status': '301', 'response_data': body, 'error': error}, row['id'])
        return {'status': 1, 'msg': 'notify ok', 'donotify_notifyid': notifyid}
    except Exception as e:
        # Network/HTTP failure: record the error on the notify row.
        logger.error("doNotify::" + str(notifyid) + "::" + repr(e))
        db.updatebyid('task_notify', {'status': '3', 'error': repr(e)}, row['id'])
        return {'status': 1, 'msg': repr(e), 'donotify_notifyid': notifyid}
def getToken_key(key):
    """Issue a fresh token (valid ~2 hours) for the app with this unique_key.

    Returns {'token', 'expired'} or False when the key is unknown.
    """
    appObj = db.fetchone('select * from app where unique_key=:key', {'key': key})
    if not appObj:
        return False
    expired = getTime("%Y-%m-%d %H:%M:%S", (getTime() + 7200))
    newToken = md5(expired)
    db.updatebyid('app', {'token': newToken, 'token_expired': expired}, appObj['id'])
    return {'token': newToken, 'expired': expired}
def _getSiteid(url):
    """Return the site id for a URL's main domain, creating the site and
    subdomain rows on first sight."""
    host = urlparse(url)[1]
    if ':' in host:
        host = host.split(':')[0]
    mainDomain = getDomainMain(host)
    site = db.fetchone('select * from site where domain=:domain', {'domain': mainDomain})
    siteid = site['id'] if site else db.insert('site', {'domain': mainDomain})
    if host != mainDomain:
        # Register the subdomain once under its parent site.
        known = db.fetchone(
            'select * from domain where subdomain=:domain', {'domain': host})
        if not known:
            db.insert('domain', {'site_id': siteid, 'subdomain': host})
    return siteid
def get_id(settingid=None):
    """Fetch one setting row by id with formatted timestamps.

    Returns False when no such row exists.
    """
    row = db.fetchone("select * from setting where id=%s", [settingid])
    if not row:
        return False
    row['create_at'] = formatTimestamp(row['create_at'])
    row['update_at'] = formatTimestamp(row['update_at'])
    return row
def get_id(appid=None):
    """Fetch one app row by id, with formatted timestamps and the public
    key stripped out. Returns False when no such row exists."""
    row = db.fetchone("select * from app where id=:id", {'id': appid})
    if not row:
        return False
    for field in ('token_expired', 'create_at', 'update_at'):
        row[field] = formatTimestamp(row[field])
    del row['public_key']
    return row
def execute_getnew_taskid(taskid):
    """Return the latest task_execute row for a task, or False if none.

    Timestamps are formatted for display. start_at/end_at may still be
    unset while an execution is running; they become '' in that case.
    """
    execute = db.fetchone(
        'select * from task_execute where task_id=:id order by id desc', {'id': taskid})
    if not execute:
        return False
    # Fixed: guard the nullable start_at/end_at the same way the sibling
    # accessors (task_getnew_id, execute_get_id) do, instead of passing
    # a possibly-unset value to formatTimestamp.
    execute['start_at'] = formatTimestamp(execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(execute['end_at']) if execute['end_at'] else ''
    execute['create_at'] = formatTimestamp(execute['create_at'])
    execute['update_at'] = formatTimestamp(execute['update_at'])
    return execute
def task_getnew_id(taskid):
    """Return the latest task_execute row for a task with formatted
    timestamps, or False when the task has no executions yet."""
    execute = db.fetchone(
        'select * from task_execute where task_id=:task_id order by id desc limit 1',
        {'task_id': taskid})
    # Fixed: a task without executions previously raised
    # "TypeError: 'NoneType' object is not subscriptable" below.
    if not execute:
        return False
    execute['start_at'] = formatTimestamp(
        execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(
        execute['end_at']) if execute['end_at'] else ''
    execute['create_at'] = formatTimestamp(execute['create_at'])
    execute['update_at'] = formatTimestamp(execute['update_at'])
    return execute
def execute_get_id(executeid):
    """Fetch one task_execute row by id with formatted timestamps.

    Returns False when no such execution exists; start_at/end_at become
    '' while unset.
    """
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    if not execute:
        return False
    for field in ('start_at', 'end_at'):
        execute[field] = formatTimestamp(execute[field]) if execute[field] else ''
    for field in ('create_at', 'update_at'):
        execute[field] = formatTimestamp(execute[field])
    return execute
def _wrapper(*args, **kwargs):
    # Auth guard closure (wraps `func` from the enclosing decorator, which
    # is not visible in this chunk): resolves the calling app from the
    # `token` request header before delegating to the wrapped view.
    token = request.headers['token']
    appObj = db.fetchone("select * from app where token=:token", {'token': token})
    if not appObj:
        # Unknown token: JSON error payload, still HTTP 200.
        return {
            'status': 'failed',
            'msg': 'token error',
            'token': token
        }, 200
    # Stash the authenticated app on the session for downstream handlers.
    session['app'] = appObj
    return func(*args, **kwargs)
def piping_fingerprint(executeid):
    """Fingerprint check: diff static-asset content hashes against the
    previous execution of the same task and report changed resources."""
    execute = db.fetchone('select * from task_execute where id=:id', {'id': executeid})
    if not execute:
        return True
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'fingerprint'})
    if not piping:
        return True
    executeOld = db.fetchone('select * from task_execute where task_id=:tid and id<:eid order by id desc',
                             {'tid': execute['task_id'], 'eid': executeid})
    if not executeOld:
        return True
    exts = ['js', 'css', 'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp']
    fields = ['id', 'url', 'md5_url', 'md5_body', 'status', 'file_extension']

    def _index(eid):
        # Map md5_url -> row for the static assets of one execution.
        indexed = {}
        for row in mgdb.spiderurl_getall_executeid_status(eid, 2, fields):
            if row['file_extension'] in exts:
                indexed[row['md5_url']] = row
        return indexed

    current = _index(executeid)
    previous = _index(executeOld['id'])
    results = []
    for md5Url, row in current.items():
        old = previous.get(md5Url)
        # Only assets present in both runs with differing body hashes count.
        if old is None or row['md5_body'] == old['md5_body']:
            continue
        filename = "snap_view_%s.png" % row['id']
        snapshot = _snapshot_save(executeid, 'view', row['url'], filename)[0]
        pipingResult = {
            'url': row['url'],
            'md5_url': md5Url,
            'md5_body_new': row['md5_body'],
            'md5_body_old': old['md5_body'],
            'snapshot': snapshot
        }
        snapshot_insert(executeid, piping, row, pipingResult, snapshot)
        results.append(pipingResult)
    return result_save(execute, piping, results) if results else True
def piping_getall_taskid(taskid=None):
    """Return a task's piping configuration rows keyed by piping type.

    filterword/keyword pipings additionally get a 'words' field loaded
    from their piping_extend row ('' when that row is missing).
    """
    pipings = {}
    taskPipings = db.fetchall('select * from task_piping where task_id=:id', {'id': taskid})
    for piping in taskPipings:
        piping['create_at'] = formatTimestamp(piping['create_at'])
        piping['update_at'] = formatTimestamp(piping['update_at'])
        pipingType = piping['type']
        if pipingType in ['filterword', 'keyword']:
            pipingExtend = db.fetchone('select * from piping_extend where id=:id', {'id': piping['extend_id']})
            # Fixed: a missing/deleted extend row previously raised
            # "TypeError: 'NoneType' object is not subscriptable".
            piping['words'] = pipingExtend['data'] if pipingExtend else ''
        pipings[pipingType] = piping
    return pipings
def run():
    """Execute scripts/createdb.sql against the master database, then
    print one sample row from Test_table."""
    with open('scripts/createdb.sql', 'r') as file:
        script = file.read()
        with pymssql.connect(
            server='db.bigmountaintiger.com',
            port='1433',
            user='******',
            password='******',
            database='master'
        ) as conn:
            # autocommit is required for drop/create database DDL statements.
            conn.autocommit(True)
            with conn.cursor() as cur:
                cur.execute(script)
    r = db.fetchone('SELECT * FROM Test_table')
    print(r)
def print_current():
    """Print the first row of Test_table (debug helper)."""
    print(db.fetchone('SELECT * FROM Test_table'))
def piping_darklink(executeid):
    """Dark-link detection for one execution.

    Outbound links are checked, in order, against the personal
    white/black lists, the system white/black lists, a sensitive-word
    scan of the linked page body, and an invisibility flag; hits are
    recorded with a suspicion level of 'absolute', 'high' or 'low'
    respectively.

    Returns False when the execution is unknown, the saved result id
    when dark links were found, otherwise True.
    """
    execute = mgdb.execute_getbyid(executeid)
    if not execute:
        return False
    piping = db.fetchone('select * from task_piping where task_id=:tid and type=:type and status=1',
                         {'tid': execute['task_id'], 'type': 'darklink'})
    if not piping:
        return True
    pipingExtend = db.fetchall('select name from sys_filterword')
    words = [row['name'] for row in pipingExtend] if pipingExtend else []
    acism = Acism(words)
    # System white/black lists, de-duplicated.
    rows = db.fetchall('select domain from dk_white_list')
    whites_glb = list(set([row['domain'] for row in rows] if rows else []))
    rows = db.fetchall('select domain from dk_black_list')
    blacks_glb = list(set([row['domain'] for row in rows] if rows else []))
    # Personal white/black lists, stored as JSON on the piping_extend row
    # (written by piping_save via json.dumps). Fixed: parse with
    # json.loads instead of eval() on database content, and parse once.
    pipingExtend = db.fetchone('select data from piping_extend where id=:id and status=1',
                               {'id': piping['extend_id']})
    extendData = json.loads(pipingExtend['data'])
    whites_psl = list(set(extendData['white_list']))
    blacks_psl = list(set(extendData['black_list']))

    results = []

    def _report(row, level):
        # Snapshot and record one dark link, unless that URL has already
        # been reported for this execution.
        seen = [item['darklink'] for item in results]
        if row['url'] in seen:
            return
        filename = 'snap_code_darklink_%s.png' % row['id']
        snapshots = _snapshot_save(executeid, 'code', row['file_path'], filename, words=[row['url']])
        result = {'id': row['id'], 'referer': row['referer'], 'darklink': row['url'],
                  'level': level, 'snapshot': "\n".join(snapshots)}
        snapshot_insert(executeid, piping, row, result, snapshots)
        results.append(result)

    mgRows = mgdb.spiderurl_getall_executeid_status(
        executeid, 2,
        ['id', 'url', 'md5_url', 'md5_body', 'url_type', 'file_extension', 'file_path', 'invisible', 'referer'])
    for row in mgRows:
        # Personal lists first (certainty: absolute).
        if row['url'] in whites_psl:
            continue
        if row['url'] in blacks_psl:
            _report(row, 'absolute')
            continue
        # Static files are not dark links.
        if row['url_type'] != 'other':
            continue
        if row['file_extension'] not in ['', 'html']:
            continue
        # System lists (certainty: absolute).
        if row['url'] in whites_glb:
            continue
        if row['url'] in blacks_glb:
            _report(row, 'absolute')
            continue
        # Sensitive-word scan of the linked page body (certainty: high).
        body = urlopen("%s/download?filekey=%s" % (DFS_URL, row['file_path'])).read().decode('utf-8', 'ignore')
        resultWord = acism.scan(body)
        if resultWord:
            _report(row, 'high')
            # Also attach the matched words to the outlink document.
            finddata = {'domain': execute['domain'], 'md5_body': row['md5_body']}
            setdata = {'$set': {'filterwords': json.dumps(resultWord, ensure_ascii=False)}}
            mongoSpider['outlink'].find_and_modify(finddata, setdata)
            continue
        # Hidden links (certainty: low).
        if row['invisible']:
            _report(row, 'low')
            continue
        # TODO: reference-count based detection (medium/high by threshold)
        # was sketched here in commented-out code but never finished.
    # Fixed: previously returned None (bare fall-through) when nothing was
    # found; every sibling piping returns True in that case.
    return result_save(execute, piping, results) if results else True
def task_save(params=None):
    """Create or update a task and (re)schedule its execution.

    params must carry 'id' (update) and/or 'start_urls' (create).
    Scheduling modes, in priority order: one-shot delayed run
    ('execute_at' / 'execute_delay'), crontab plan ('crontab'), or an
    immediate start.

    Returns the task id, or False on bad input.
    """
    if not params['id'] and not params['start_urls']:
        return False
    if params['start_urls']:
        startUrls = params['start_urls'].split("\n")
        params['start_urls'] = json.dumps(startUrls, ensure_ascii=False)
    else:
        params['start_urls'] = ''
    # Column defaults for newly created tasks.
    # Fixed: 'exec_level' was listed twice; kept once.
    defaultKeys = {
        'app_id': 0,
        'type': 'spider',
        'start_urls': '',
        'exec_level': 0,
        'limit_depth': 2,
        'limit_total': 1000,
        'limit_time': 0,
        'limit_subdomain': 0,
        'limit_image': 0,
        'limit_js': 0,
        'url_unique_mode': 'url-query',
        'notify_url': '',
        'source_ip': '',
        'exclude_urls': '',
        'proxies': '',
        'crontab': '',
        'status': 0,
    }
    # One-shot scheduling: explicit datetime, or a delay in seconds.
    rundate = None
    if 'execute_at' in params.keys() and params['execute_at']:
        rundate = datetime.strptime(params['execute_at'], '%Y-%m-%d %H:%M:%S')
    if 'execute_delay' in params.keys() and params['execute_delay']:
        rundateStr = getTime('%Y-%m-%d %H:%M:%S', getTime() + params['execute_delay'])
        rundate = datetime.strptime(rundateStr, '%Y-%m-%d %H:%M:%S')
    # Persist the task row; an update only touches columns present in params.
    taskdata = {}
    keys = defaultKeys.keys()
    if params['id']:
        taskid = params['id']
        for key in keys:
            if key in params.keys() and params[key]:
                taskdata[key] = params[key]
        db.updatebyid('task', taskdata, taskid)
    else:
        taskdata['site_id'] = _getSiteid(startUrls[0])
        for key in keys:
            if key in params.keys() and params[key]:
                taskdata[key] = params[key]
            else:
                taskdata[key] = defaultKeys[key]
        taskid = db.insert('task', taskdata)
    jobid = 'task_%s' % taskid
    # One-shot job: store/refresh a 'date' trigger and return.
    if rundate:
        job = db.getbyid('scheduler', jobid)
        if job:
            db.updatebyid('scheduler', {'run_date': rundate}, jobid)
        else:
            scheduler = {
                'id': jobid,
                'name': jobid,
                'func': 'business.task:task_start',
                'args': '[' + str(taskid) + ']',
                'trigger_type': 'date',
                'run_date': rundate,
                'coalesce': 0,
                'next_run_time': rundate,
                'max_instances': 3,
                'executor': 'default',
                # NOTE(review): key carries a trailing space in existing
                # data — kept as-is; confirm whether it is intended.
                'misfire_grace_time ': 1,
            }
            # Fixed: was `db.insert(scheduler)` — the table name was
            # missing (compare the cron branch below).
            db.insert('scheduler', scheduler)
        return taskid
    # No crontab plan: start immediately.
    task = db.fetchone("select * from task where id=:id", {'id': taskid})
    if not task['crontab']:
        task_start(taskid)
        return taskid
    # Disabled task: drop its scheduled job.
    # Fixed: taskdata may lack 'status'/'crontab' on updates (falsy values
    # are not copied above), which previously raised KeyError.
    if taskdata.get('status', 0) < 1 and taskdata.get('crontab'):
        db.exec('delete from scheduler where id=:id', {'id': jobid})
        return taskid
    # Add or refresh the cron job.
    job = db.getbyid('scheduler', jobid)
    if job:
        crontab = '0 ' + task['crontab'] + ' * *,SMHdmwWY'
        db.updatebyid('scheduler', {'crontab': crontab}, jobid)
    else:
        tz = pytz.timezone('Asia/Shanghai')
        scheduler = {
            'id': jobid,
            'name': jobid,
            'func': 'business.task:task_start',
            'args': '[' + str(taskid) + ']',
            'trigger_type': 'cron',
            'crontab': '0 ' + task['crontab'] + ' * *,SMHdmwWY',
            'coalesce': 0,
            'next_run_time': datetime.now(tz=tz).strftime('%Y-%m-%d %H:%M:%S%z'),
            'max_instances': 3,
            'executor': 'default',
            'misfire_grace_time ': 1,
        }
        db.insert('scheduler', scheduler)
    return taskid
def piping_save(rows=None, taskid=None):
    """Create or update the piping configuration rows for a task.

    Each entry in rows describes one piping type (darklink, filterword,
    keyword, error_http_code, ...). Its extended data is upserted into
    piping_extend, then the task_piping row itself is upserted with the
    resulting extend id. Returns True.
    """
    task = db.fetchone('select * from task where id=:id', {'id': taskid})
    for row in rows:
        taskPiping = db.fetchone('select * from task_piping where task_id=:tid and type=:type',
                                 {'tid': taskid, 'type': row['type']})
        extendId = 0
        pipingExtendOld = None
        if taskPiping:
            extendId = taskPiping['extend_id']
            pipingExtendOld = db.fetchone('select * from piping_extend where id=:id', {'id': extendId})
        # Dark-link personal white/black lists.
        if row['type'] == 'darklink':
            pipingExtend = {}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            # Fixed: parse the JSON we just produced with json.loads
            # instead of eval() — identical result, no code execution risk.
            white_list = json.dumps(row['white_list'], ensure_ascii=False) if row['white_list'] else '[]'
            white_list = {'white_list': json.loads(white_list)}
            black_list = json.dumps(row['black_list'], ensure_ascii=False) if row['black_list'] else '[]'
            black_list = {'black_list': json.loads(black_list)}
            pipingExtend['data'] = json.dumps(dict(white_list, **black_list))
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)
        # Sensitive words: own/plus/reduce operations on the word list.
        if row['type'] == 'filterword' and 'filterwords' in row.keys() and 'filterword_operate' in row.keys():
            words = []
            wordsOld = []
            wordsNew = row['filterwords'].replace(' ', '').split("\n")
            if pipingExtendOld:
                extendId = pipingExtendOld['id']
                wordsOld = pipingExtendOld['data'].split("\n") if pipingExtendOld['data'] else []
            # Replace the own dictionary entirely.
            if row['filterword_operate'] == 'own':
                words = wordsNew
            # Add words to the existing dictionary.
            if row['filterword_operate'] == 'plus':
                words.extend(wordsNew)
                if wordsOld:
                    words.extend(wordsOld)
            # Remove the given words from the existing dictionary.
            if row['filterword_operate'] == 'reduce' and wordsOld:
                wordsCommon = list(set(wordsNew) & set(wordsOld))
                for word in wordsCommon:
                    wordsOld.remove(word)
                words = wordsOld
            if '' in words:
                words.remove('')
            words = list(set(words))
            pipingExtend = {}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            pipingExtend['data'] = "\n".join(words)
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)
        # Keywords (per-URL required words).
        if row['type'] == 'keyword' and 'keywords' in row.keys():
            pipingExtend = {}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            pipingExtend['data'] = json.dumps(row['keywords'], ensure_ascii=False) if row['keywords'] else ''
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)
        # Error HTTP status codes.
        if row['type'] == 'error_http_code' and 'http_codes' in row.keys():
            pipingExtend = {}
            pipingExtend['app_id'] = task['app_id']
            pipingExtend['site_id'] = task['site_id']
            pipingExtend['task_id'] = taskid
            pipingExtend['piping_type'] = row['type']
            pipingExtend['data'] = row['http_codes']
            pipingExtend['status'] = 1
            if pipingExtendOld:
                db.updatebyid('piping_extend', pipingExtend, extendId)
            else:
                extendId = db.insert('piping_extend', pipingExtend)
        # Upsert the task_piping row itself.
        wordType = row['filterword_type'] if 'filterword_type' in row.keys() else ''
        status = row['status'] if 'status' in row.keys() else 1
        piping = {}
        piping['status'] = status
        piping['extend_id'] = extendId
        piping['filterword_type'] = wordType
        if taskPiping:
            pipingId = db.updatebyid('task_piping', piping, taskPiping['id'])
        else:
            piping['app_id'] = task['app_id']
            piping['site_id'] = task['site_id']
            piping['task_id'] = taskid
            piping['type'] = row['type']
            pipingId = db.insert('task_piping', piping)
    return True
def crawl(urlInfo):
    # Fetch one queued URL, store the response, parse new links out of the
    # page and enqueue them for further crawling / mirroring / dark-link
    # analysis. Returns True on handled outcomes (including request
    # errors), False when the execution is unknown or an exception occurs.
    # NOTE(review): the success path falls off the end of the try block and
    # returns None — callers apparently must treat non-False as success.
    uI = urlInfo
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute: return False
    # Dark-link analysis is only performed when the task has an active
    # 'darklink' piping configured.
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})
    try:
        ## If the task has already finished, return (disabled).
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True
        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            uI['id'], uI['task_id'], uI['execute_id'], uI['method'], uI['url']
        ))
        # Fetch the page and parse its data.
        response = {}
        urlItems = []
        #proxy = {'url':'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'], urlInfo['request_headers'], proxy=proxy)
        # Request error: persist and return.
        if requestInfo['error']:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # 304 or any other non-200 status code: persist and return.
        if requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # Normal response: derive the file type from the Content-Type header.
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders['Content-Type'] if 'Content-Type' in responseHeaders.keys() else None
        contentType = parseContentType(contentTypeRaw, default = 'text/html')
        fileType = mime2file(contentType)
        #logger.debug("Content-Type::::::::" + contentTypeRaw + "::::" + contentType)
        # Store the response body and record the response.
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])
        # Not an html page: nothing to parse.
        if fileType != 'html': return True
        # External link: no further analysis.
        if urlInfo['url_type'] != 'self': return True
        # Single-page mirror: do not analyse the page.
        if execute['task_type'] == 'mirror_one': return True
        # Regex-parse the page for links.
        urlItems = parse_reg(requestInfo)
        # Dark-link detection.
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            # logger.info('parse_darklink::::%s::::' % (result))
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks
        '''
        Browser-based parsing section
        '''
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results: urlItems = urlItems + results
        # logger.info('parse_darklink::::%s::::%s' % ('urls_uniq', json.dumps(urlItems)))
        # De-duplicate the discovered URLs.
        urlItems = _urls_uniq(urlItems)
        # Append the new URLs to the crawl/mirror/outlink queues.
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        # logger.info('parse_darklink::::%s::::' % (urlItems))
        # logger.info('parse_darklink::::%s::::%s' % ('urlItems', json.dumps(urlItems)))
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url): continue
            fileExtension = extension(url)
            urlType = _getDomainType(url, execute['domain'])
            # isExists = _checkUrlExists(execute['id'], url, row['method'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists: continue
            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''
            # Links off this site (unless dark-link analysis wants them)
            # get status 5, i.e. "do not fetch".
            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark:
                        flagOutlink = 1
                        item['status'] = 0
                    # NOTE(review): the collapsed source makes the nesting of
                    # this clause relative to the pipingDark guard ambiguous;
                    # placed as its sibling — confirm against the original.
                    if urlType == 'other':
                        outlinks.append(_formatOutlink(execute, urlInfo['url'], url, row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)
        # logger.info('22parse_darklink::::%s::::%s' % ('queueSite', json.dumps(queueSite)))
        # logger.info('22parse_darklink::::%s::::%s' % ('queueOut', json.dumps(queueOut)))
        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems, response['md5_body'], 'regular'))
        if outlinks:
            mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            # logger.info('parse_darklink::::::::%s' % (queueSite))
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # Status flag non-zero: do not fetch.
                if item['status'] != 0: continue
                # Depth beyond the limit: do not fetch.
                if item['depth'] > execute['limit_depth']: continue
                # Total beyond the limit: do not fetch.
                if stats['total'] > execute['limit_total']: continue
                # Mirror task: do not fetch images.
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts: continue
                # Single-page monitor: do not fetch sub-pages.
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts: continue
                # Images disabled: do not fetch them.
                if not execute['limit_image'] and item['file_extension'] in staticExts: continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                # Enqueue for fetching.
                undos.append(item)
                # Enqueue for mirroring.
                if execute['task_type'] == 'mirror':
                    mirrors.append(item)
        if queueOut:
            # logger.info('parse_darklink::::::::%s' % (queueOut))
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results:
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos: Mq.produce(undos, 'spider')
        if mirrors: Mq.produce(mirrors, 'mirror')
    except Exception as e:
        logger.exception(e)
        return False