def _formatResponse(requestInfo, execute, urlInfo, fileInfo=('','')):
    '''Build the spiderurl update document from a raw request result.

    Returns a dict describing either a failed fetch (status 3) or a
    successful one (status 2, with body/header digests and file info),
    or False when formatting itself raised.
    '''
    try:
        fetchFailed = requestInfo['error'] or requestInfo['http_code'] != 200
        if fetchFailed:
            # Failure record: keep only timing, code and the repr'd error.
            return {
                'nettime': requestInfo['nettime'],
                'status': 3,
                'http_code': requestInfo['http_code'],
                'error': repr(requestInfo['error']),
                'end_at': now_format(),
                'depth': urlInfo['depth'],
            }
        # Success record: digests, redirect chain, headers and stored file.
        return {
            'nettime': requestInfo['nettime'],
            'status': 2,
            'end_at': now_format(),
            'depth': urlInfo['depth'],
            'md5_url': md5(urlInfo['url']),
            'md5_body': md5(requestInfo['body']),
            'redirects': json.dumps(requestInfo['redirects'], ensure_ascii=False),
            'http_code': requestInfo['http_code'],
            'request_headers': json.dumps(requestInfo['request_headers'], ensure_ascii=False),
            'response_headers': json.dumps(requestInfo['response_headers'], ensure_ascii=False),
            'file_name': fileInfo[0],
            'file_path': fileInfo[1],
        }
    except Exception as e:
        logger.exception(e)
        return False
def init_app():
    '''Seed the `app` table with the built-in client applications.

    Only meant to be run once, when the system first goes online /
    during initialisation -- handle with care.

    NOTE(review): the bare `return True` directly below intentionally
    disables this routine; every statement after it is unreachable.
    Remove that guard deliberately before re-running the seed.
    '''
    return True
    #tsgz
    appid = 2
    app = {
        'id': appid,
        'unique_key': 'tsgz',
        'public_key': "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEA9Vp7hhFpJe2zYuGDDBQ2wb0e7tKHwfHdE6e8ZUJDkMgPLKBEbHwo\nSuvLXgrtGqjclVSIn6Py+NmQbtWxnOZuV/2O/jzhnflu8vVoXVwEuj4gj3+jGZV4\nB0MFICeZ/+qM2UcqrquxQrLhV1gU8InaaTgkMtC4Iag38YdDUy6MdBH7yOQzmUuq\nd5PhbsZeb45Y2OSuq2jhg3d1Xu1vHIrj1A0jSs99d5lOdubpCu7l1JC3WrjVBISj\nlQnrQmUATVy6Tr0Wvv8n1hqaZVNGpAM6pI4UtF+OldU7MrNqQzc+8a5hj2A2SGZE\nfPgyjaS8p+/K4tECY0STfXtB7wjg8oU8bQIDAQAB\n-----END RSA PUBLIC KEY-----",
        'token': '9a684815a09c65edb52b7612cda4b1ad',
        # expiry is initialised to the current timestamp
        'token_expired': now_format(),
    }
    # Upsert by fixed id: update when the row already exists, else insert.
    row = db.getbyid('app', appid)
    if row:
        db.updatebyid('app', app, appid)
    else:
        db.insert('app', app)
    #homev5_apiv4
    appid = 3
    app = {
        'id': appid,
        'unique_key': 'homev5_apiv4',
        'public_key': "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEA7LRjexk787YP48ZiWOwHNa93VF+J0H/pdINSvqIqWU6yAarpkLWq\nKV9Xd27QCcK6z459b0v/S6QplOPWks7m0MCFrflxkAEjd6MtXJiq3a6rcnX1w0vu\nPozNcM8ibLQI6XoSWNx2sUlQDpDdT9JvdGsnoCfY+pAS3gycgAHzFJH9UbY68igk\nn1cqFuADso3YLXZssK+eslnsfK20iZPiobmSWACLz0vi0gxTABSLqXM3ovJBZgiB\n0QUqvKJY1pM0dHpVpnj73y3CutqH+v255x32y2DVfG4AC6hxCojIhQDx8vAqsKc1\nHYcKxCTPGGVGGvmDUDevwvmvF+GjDZ0SQQIDAQAB\n-----END RSA PUBLIC KEY-----",
        'token': '20d812f96badf9f811cde6f9916d5a50',
        'token_expired': now_format(),
    }
    row = db.getbyid('app', appid)
    if row:
        db.updatebyid('app', app, appid)
    else:
        db.insert('app', app)
    #homev5_apiv4_mirror
    appid = 4
    app = {
        'id': appid,
        'unique_key': 'homev5_apiv4_mirror',
        'public_key': "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEAwT90eOKM9YaUDYM1v2WR1TL7Qf1t3e3ogCkFSSbH0D4IBn/bVOi9\nCq7jDRZH9F75j6uGXymMLGF841kgrgn8NdyalqaLGRrufw+K971UfNToT/SEAW9O\n+HlZLIV+itAVbBly5/LJFc16aPUH2L47r8qFIFB0PfjLSAsHhbRRs6jLyuZTtzGi\no4iod7/5R+ip216fu7cxiAE3wBhfKTT7IYnAnW7+tYLPqlGcszJkSJtZozHcxudw\n4nVRu+2pkP9ud1YnbWSVGDADMQ33YaKSrm4O+dCDw5EqhmYo+0xH39TNS/2GjCK2\n83R0ZvuS9KkYCNhSYYEKVKiyTuavTpsWWwIDAQAB\n-----END RSA PUBLIC KEY-----",
        'token': '69866dde69bced6b006708b936e038c3',
        'token_expired': now_format(),
    }
    row = db.getbyid('app', appid)
    if row:
        db.updatebyid('app', app, appid)
    else:
        db.insert('app', app)
    return True
def _formatStatic(domain, url, filename, filekey, filetype, md5Body):
    '''Build a static-resource record ready for persistence.'''
    record = {}
    record['domain'] = domain
    record['url'] = url
    record['file_name'] = filename
    record['file_key'] = filekey
    record['file_type'] = filetype
    # Digest of the URL itself; the body digest is supplied by the caller.
    record['md5_url'] = md5(url)
    record['md5_body'] = md5Body
    record['create_at'] = now_format()
    record['update_at'] = now_format()
    return record
def _formatParse(execute, urlInfo, result, md5Body, parseType='regular'):
    '''Build a parse-result record for persistence.

    :param execute:   execute row (provides site/task/app/execute ids)
    :param urlInfo:   spiderurl row the result was parsed from
    :param result:    parsed payload to store under 'result'
    :param md5Body:   digest of the fetched body
    :param parseType: parser that produced the result (default 'regular')

    BUGFIX: `parseType` was previously ignored -- 'parse_type' was
    hard-coded to 'regular', so any caller passing another parser name
    had it silently discarded. The default keeps existing behavior.
    '''
    return {
        'site_id': execute['site_id'],
        'task_id': execute['task_id'],
        'app_id': execute['app_id'],
        'execute_id': execute['id'],
        'url_id': urlInfo['id'],
        'referer': urlInfo['referer'],
        'url': urlInfo['url'],
        'md5_url': urlInfo['md5_url'],
        'md5_body': md5Body,
        'parse_type': parseType,
        'result': result,
        'create_at': now_format(),
        'update_at': now_format()
    }
def _formatOutlink(execute, referer, url, md5Body='', invisible=0):
    '''Build an outbound-link (external link) record.'''
    link = {}
    link['task_id'] = execute['task_id']
    link['execute_id'] = execute['id']
    link['domain'] = execute['domain']
    # Both the referring page and the target URL are stored with digests.
    link['referer'] = referer
    link['md5_referer'] = md5(referer)
    link['url'] = url
    link['md5_url'] = md5(url)
    link['md5_body'] = md5Body
    link['invisible'] = invisible
    link['filterwords'] = ''
    link['date'] = getDate()
    link['create_at'] = now_format()
    link['update_at'] = now_format()
    return link
def save(params):
    '''Save a snapshot request and queue it for processing.

    Builds the snapshot record, computes the snapshot URL(s) -- one URL
    per 800-line page for 'code' snapshots, a single URL otherwise --
    inserts the record and pushes it onto the 'snapshot' MQ.

    Returns the list of snapshot URLs.

    FIX: the HTTP response from urlopen() was never closed (socket
    leak); it is now used as a context manager.
    '''
    item = {
        'app_key': params['app_key'],
        'batch': params['batch'],
        'uuid': uuid1().hex,
        'type': params['type'],
        'url': params['url'],
        'filename': params['filename'],
        'words': ",".join(params['words']),
        'proxy': params['proxy'],
        'notify_url': params['notify_url'],
        'error': '',
        'status': 0,
        'create_at': now_format(),
        'update_at': now_format(),
    }
    if item['type'] == 'code':
        # Code snapshots are paginated: one image per `pagesize` lines.
        downloadUrl = '%s/download?filekey=%s' % (DFS_URL, params['url'])
        pagesize = 800
        with urlopen(downloadUrl) as resp:
            body = resp.read().decode('utf-8', 'ignore')
        total = len(body.split("\n"))
        pageTotal = math.ceil(total / pagesize)
        url = '%s/%s/%s' % (QINIU_DOMAIN, QINIU_KEY_PREFIX, basename(params['filename']))
        snapshots = [url.replace('.png', '_%s.png' % i) for i in range(pageTotal)]
        item['snapshot'] = "\n".join(snapshots)
    else:
        item['snapshot'] = '%s/%s/%s' % (QINIU_DOMAIN, QINIU_KEY_PREFIX, basename(params['filename']))
    # Insert a copy so MQ bookkeeping keys below don't leak into storage.
    itemid = mgdb.c_insert('snapshot', deepcopy(item))
    item['id'] = itemid
    item[mqidKey] = item['id']
    item[batchKey] = item['batch']
    Mq.produce([item], 'snapshot')
    return item['snapshot'].split("\n")
def consume(mqkey=''):
    '''Pop one task from the `mqkey` queue and mark it as "doing".

    Pops the raw JSON item from the Redis ready-list, then mirrors the
    state change into MongoDB: batch-run bookkeeping, the per-process
    current-id marker, ready -> doing stage move, and the stats counters.
    Returns the decoded item, or False when no key/queue item is available.
    '''
    if not mqkey: return False
    readyKey = 'mq_%s_ready' % mqkey
    doingKey = 'mq_%s_doing' % mqkey
    # NOTE(review): identical to readyKey above -- redundant duplicate.
    readyKeyRedis = 'mq_%s_ready' % mqkey
    jsonRaw = redis.lpop(readyKeyRedis)
    if not jsonRaw: return False
    item = json.loads(jsonRaw.decode('UTF8'))
    # Record the batch id: create the batch-run row on first sighting.
    runBatch = mongoMq['stats_batch_run'].find_one({
        'mqkey': mqkey,
        'batch': item[batchKey]
    })
    if not runBatch:
        mongoMq['stats_batch_run'].insert({
            "mqkey": mqkey,
            "batch": item[batchKey],
            "is_end": 0,
            "start_at": now_format(),
            "end_at": ""
        })
    # Update the current queue id on this worker's process_list row.
    # title[7:] presumably strips a fixed proc-title prefix -- TODO confirm
    # against _get_proc_title().
    title = _get_proc_title()
    mongoMq['process_list'].find_and_modify(
        {
            'hostname': hostname,
            'title': title[7:]
        }, {'$set': {
            mqidKey: item[mqidKey]
        }})
    # Move the item from the ready collection into the doing collection.
    mongoMq[doingKey].find_and_modify({mqidKey: item[mqidKey]},
                                      {'$set': item},
                                      upsert=True)
    mongoMq[readyKey].delete_many({mqidKey: item[mqidKey]})
    # Shift the stage counters: one fewer ready, one more doing.
    mongoMq['stats_mq'].find_and_modify({'mq_key': mqkey},
                                        {"$inc": {
                                            "ready": -1,
                                            "doing": 1
                                        }})
    mongoMq['stats_batch_stage'].find_and_modify(
        {
            "mqkey": mqkey,
            'batch': item[batchKey]
        }, {"$inc": {
            "ready": -1,
            "doing": 1
        }})
    return item
def run(self):
    '''Supervisor loop: reconcile the process list with the process config.

    Runs only on hosts whose name ends in "master". Every 10 seconds it
    reads the process-config collection, computes the desired set of
    worker titles per host, upserts them into the process-list
    collection, and marks titles that are no longer configured as
    'disable'. Never returns on the master (infinite loop).
    '''
    #log_to_stderr()
    #logger = multiprocessing.get_logger().setLevel(logging.INFO)
    if hostname[-6:] != 'master': return False
    # Initialise data when the process starts.
    while True:
        processes = []
        removes = []
        exists = []
        for row in mongoMq[pConfigKey].find({}, {"_id": 0}):
            hostnameT = row['hostname']
            status = row['status']
            titles = []
            # Every other key in the config row is assumed to be
            # "mqkey -> desired worker count" -- TODO confirm schema.
            for key, value in row.items():
                if key in ['hostname', 'status']: continue
                for index in list(range(value)):
                    # Worker titles are 1-based: "<mqkey>-1", "<mqkey>-2", ...
                    title = "%s-%s" % (key, index + 1)
                    processes.append({
                        "hostname": hostnameT,
                        "title": title,
                        "processid": 0,
                        "mqkey": key,
                        "status": status,
                        "mqid": 0,
                        # compact timestamp: digits only (YYYYMMDDHHMMSS)
                        "update": now_format().replace('-', '').replace(':', '').replace(
                            ' ', '')
                    })
                    titles.append(title)
            # Titles present in the list but absent from the config are
            # scheduled for disabling.
            titleExists = [
                i['title'] for i in mongoMq[pListKey].find(
                    {"hostname": hostnameT}, {'_id': 0})
            ]
            for title in list(set(titleExists) - set(titles)):
                removes.append({'hostname': hostnameT, 'title': title})
        # Upsert the desired processes.
        for row in processes:
            proc = mongoMq[pListKey].find_one({
                "hostname": row['hostname'],
                "title": row['title']
            })
            if proc:
                mongoMq[pListKey].find_and_modify(
                    {
                        "hostname": row['hostname'],
                        "title": row['title']
                    }, {
                        '$set': {
                            "status": row['status'],
                            "update": row['update']
                        }
                    })
            else:
                mongoMq[pListKey].insert(row)
        # Disable processes that are no longer configured.
        for row in removes:
            mongoMq[pListKey].find_and_modify(
                {
                    'hostname': row['hostname'],
                    'title': row['title']
                }, {'$set': {
                    'status': 'disable'
                }})
        sleep(10)
def crawl(urlInfo):
    '''Fetch one URL, persist the response, extract links and queue them.

    Pipeline: fetch via spiderRequest -> save the response record ->
    (HTML, same-site only) regex link extraction + optional dark-link
    detection -> de-dup -> classify each link (crawl / static / outlink)
    -> batch-insert and push onto the 'spider' / 'mirror' queues.

    Returns False on missing execute or on exception, True on the early
    exits; falls through (None) after a full successful run.

    BUGFIX: outlink records previously passed the invisible flag
    positionally into the `md5Body` slot of _formatOutlink(), silently
    dropping it; it is now passed by keyword.
    '''
    uI = urlInfo
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute:
        return False
    # Is a dark-link piping rule active for this task?
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})
    try:
        # If the task has already finished, return (disabled).
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True
        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            uI['id'], uI['task_id'], uI['execute_id'], uI['method'], uI['url']
        ))
        # Fetch the page.
        response = {}
        urlItems = []
        #proxy = {'url':'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'],
                                    urlInfo['request_headers'], proxy=proxy)
        # Transport error: record the failure and stop.
        if requestInfo['error']:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # 304 or any other non-200 status: record and stop.
        if requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True
        # Normal response: derive the file type from Content-Type.
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders['Content-Type'] if 'Content-Type' in responseHeaders.keys() else None
        contentType = parseContentType(contentTypeRaw, default='text/html')
        fileType = mime2file(contentType)
        # Persist the body/file and the response metadata.
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])
        # Non-HTML pages are not analysed further.
        if fileType != 'html':
            return True
        # External links are not analysed further.
        if urlInfo['url_type'] != 'self':
            return True
        # Single-page mirror: do not analyse the page.
        if execute['task_type'] == 'mirror_one':
            return True
        # Regex-based link extraction.
        urlItems = parse_reg(requestInfo)
        # Dark-link detection (invisible links planted on the page).
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks
        # Browser-based parsing (disabled).
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results: urlItems = urlItems + results
        # De-duplicate extracted URLs.
        urlItems = _urls_uniq(urlItems)
        # Classify and enqueue the new URLs.
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url):
                continue
            fileExtension = extension(url)
            urlType = _getDomainType(url, execute['domain'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists:
                continue
            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''
            # Off-site links that are not dark-link candidates get
            # status 5 (no crawl needed); everything else status 0.
            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark:
                        flagOutlink = 1
                        item['status'] = 0
                if urlType == 'other':
                    # BUGFIX: pass the invisible flag by keyword so it
                    # lands in `invisible`, not in the `md5Body` slot.
                    outlinks.append(_formatOutlink(execute, urlInfo['url'], url,
                                                   invisible=row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)
        # Persist the parse result and the outlinks.
        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems,
                                                response['md5_body'], 'regular'))
        if outlinks:
            mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # Status non-zero: no crawl.
                if item['status'] != 0:
                    continue
                # Depth limit exceeded: no crawl.
                if item['depth'] > execute['limit_depth']:
                    continue
                # Total limit exceeded: no crawl.
                if stats['total'] > execute['limit_total']:
                    continue
                # Mirror tasks do not crawl static resources.
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts:
                    continue
                # Single-page tasks do not crawl sub-pages.
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts:
                    continue
                # Image crawling disabled for this task.
                if not execute['limit_image'] and item['file_extension'] in staticExts:
                    continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                # Queue for crawling.
                undos.append(item)
                # Mirror tasks also queue for mirroring.
                if execute['task_type'] == 'mirror':
                    mirrors.append(item)
        if queueOut:
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results:
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos:
            Mq.produce(undos, 'spider')
        if mirrors:
            Mq.produce(mirrors, 'mirror')
    except Exception as e:
        logger.exception(e)
        return False
def execute_init(eid):
    '''Initialise a task execution; this function is safe to run repeatedly.

    Copies the SQL `task_execute` row into Mongo (status 101), seeds the
    `spiderurl` collection from the execute's start URLs, and pushes the
    pending URLs onto the 'spider' (and, for mirror tasks, 'mirror')
    queues -- each step guarded by a count check so re-runs do not
    duplicate data. Returns False when the execute row is missing.
    '''
    execute = db.getbyid('task_execute', eid)
    if not execute: return False
    # Normalise timestamps to formatted strings (empty when unset).
    execute['create_at'] = formatTimestamp(
        execute['create_at']) if execute['create_at'] else ''
    execute['update_at'] = formatTimestamp(
        execute['update_at']) if execute['update_at'] else ''
    execute['start_at'] = formatTimestamp(
        execute['start_at']) if execute['start_at'] else ''
    execute['end_at'] = formatTimestamp(
        execute['end_at']) if execute['end_at'] else ''
    execute['status'] = 101
    # Mirror the execute row into Mongo only on first initialisation.
    mgExecute = mgdb.execute_getbyid(eid)
    if not mgExecute:
        execute_spider = deepcopy(execute)
        mgdb.c_insert('execute', execute_spider, autoid=False)
    startUrls = json.loads(execute['start_urls'])
    startUrlsLen = len(startUrls)
    urlCount = mongoSpider['spiderurl'].find({
        'execute_id': eid
    }, {
        '_id': 0
    }).count()
    # Seed spiderurl rows only when fewer rows exist than start URLs.
    if startUrlsLen > urlCount:
        urlRows = []
        for url in startUrls:
            urldata = {
                'site_id': execute['site_id'],
                'task_id': execute['task_id'],
                'app_id': execute['app_id'],
                'task_type': execute['task_type'],
                'execute_id': eid,
                'exec_level': execute['exec_level'],
                'url': url,
                'url_type': 'self',
                'method': 'get',
                'status': 0,
                'create_at': now_format(),
                'update_at': now_format(),
            }
            urlRows.append(urldata)
        mgdb.c_insert_batch('spiderurl', urlRows)
    undos = [
        i for i in mongoSpider['spiderurl'].find({'execute_id': eid},
                                                 {'_id': 0})
    ]
    undos_spider = []
    undos_mirror = []
    # NOTE(review): both lists end up holding the SAME dict objects,
    # tagged in place with the MQ id/batch keys.
    for undo in undos:
        undo[mqidKey] = undo['id']
        undo[batchKey] = undo['execute_id']
        undos_spider.append(undo)
        undos_mirror.append(undo)
    # Count items already in any spider MQ stage for this execute.
    pre = 'mq_spider_'
    stages = ['undo', 'ready', 'doing', 'done']
    stats = {
        stage: mongoMq[pre + stage].find({
            batchKey: eid
        }).count()
        for stage in stages
    }
    total = stats['undo'] + stats['ready'] + stats['doing'] + stats['done']
    # Only enqueue when the MQ holds fewer items than there are start URLs.
    if startUrlsLen > total:
        # Push onto the spider queue.
        Mq.produce(undos_spider, 'spider')
        # Mirror tasks also feed the mirror queue.
        if execute['task_type'] == 'mirror':
            Mq.produce(undos_mirror, 'mirror')
    # First initialisation: reflect status 101 back into SQL.
    if not mgExecute:
        db.updatebyid('task_execute', {'status': 101}, eid)
    return True