예제 #1
0
def _formatResponse(requestInfo, execute, urlInfo, fileInfo=('','')):
    '''Build the result record stored for a fetched URL.

    requestInfo is the fetch result; the keys read here are nettime, error,
    http_code and, for a successful fetch, body, redirects, request_headers
    and response_headers. urlInfo supplies depth and url. fileInfo is a
    (file_name, file_path) pair. ``execute`` is accepted but not read.

    Returns a dict with status 3 when the fetch failed (error set, or an
    HTTP code other than 200) and status 2 otherwise. If anything raises
    while building the record, the exception is logged and False is
    returned instead.
    '''
    try:
        record = {'nettime': requestInfo['nettime']}

        if requestInfo['error'] or requestInfo['http_code'] != 200:
            # Failed fetch: only the diagnostic fields are recorded.
            record.update({
                'status': 3,
                'http_code': requestInfo['http_code'],
                'error': repr(requestInfo['error']),
                'end_at': now_format(),
                'depth': urlInfo['depth'],
            })
            return record

        # Successful fetch: fingerprints, headers and the stored file.
        record.update({
            'status': 2,
            'end_at': now_format(),
            'depth': urlInfo['depth'],
            'md5_url': md5(urlInfo['url']),
            'md5_body': md5(requestInfo['body']),
            'redirects': json.dumps(requestInfo['redirects'], ensure_ascii=False),
            'http_code': requestInfo['http_code'],
            'request_headers': json.dumps(requestInfo['request_headers'], ensure_ascii=False),
            'response_headers': json.dumps(requestInfo['response_headers'], ensure_ascii=False),
            'file_name': fileInfo[0],
            'file_path': fileInfo[1],
        })
        return record
    except Exception as err:
        logger.exception(err)
        return False
예제 #2
0
파일: devops.py 프로젝트: xx1820017/lixiang
def init_app():
    '''Seed the fixed rows of the ``app`` table.

    Intended only for the initial system launch; use with care.

    The function is currently switched off: it returns True immediately,
    so the seeding code below the first ``return`` never runs.
    '''
    return True

    # One entry per application: (id, unique_key, public_key, token).
    # NOTE(review): keys and tokens are hard-coded in source.
    seeds = (
        # tsgz
        (
            2,
            'tsgz',
            "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEA9Vp7hhFpJe2zYuGDDBQ2wb0e7tKHwfHdE6e8ZUJDkMgPLKBEbHwo\nSuvLXgrtGqjclVSIn6Py+NmQbtWxnOZuV/2O/jzhnflu8vVoXVwEuj4gj3+jGZV4\nB0MFICeZ/+qM2UcqrquxQrLhV1gU8InaaTgkMtC4Iag38YdDUy6MdBH7yOQzmUuq\nd5PhbsZeb45Y2OSuq2jhg3d1Xu1vHIrj1A0jSs99d5lOdubpCu7l1JC3WrjVBISj\nlQnrQmUATVy6Tr0Wvv8n1hqaZVNGpAM6pI4UtF+OldU7MrNqQzc+8a5hj2A2SGZE\nfPgyjaS8p+/K4tECY0STfXtB7wjg8oU8bQIDAQAB\n-----END RSA PUBLIC KEY-----",
            '9a684815a09c65edb52b7612cda4b1ad',
        ),
        # homev5_apiv4
        (
            3,
            'homev5_apiv4',
            "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEA7LRjexk787YP48ZiWOwHNa93VF+J0H/pdINSvqIqWU6yAarpkLWq\nKV9Xd27QCcK6z459b0v/S6QplOPWks7m0MCFrflxkAEjd6MtXJiq3a6rcnX1w0vu\nPozNcM8ibLQI6XoSWNx2sUlQDpDdT9JvdGsnoCfY+pAS3gycgAHzFJH9UbY68igk\nn1cqFuADso3YLXZssK+eslnsfK20iZPiobmSWACLz0vi0gxTABSLqXM3ovJBZgiB\n0QUqvKJY1pM0dHpVpnj73y3CutqH+v255x32y2DVfG4AC6hxCojIhQDx8vAqsKc1\nHYcKxCTPGGVGGvmDUDevwvmvF+GjDZ0SQQIDAQAB\n-----END RSA PUBLIC KEY-----",
            '20d812f96badf9f811cde6f9916d5a50',
        ),
        # homev5_apiv4_mirror
        (
            4,
            'homev5_apiv4_mirror',
            "-----BEGIN RSA PUBLIC KEY-----\nMIIBCgKCAQEAwT90eOKM9YaUDYM1v2WR1TL7Qf1t3e3ogCkFSSbH0D4IBn/bVOi9\nCq7jDRZH9F75j6uGXymMLGF841kgrgn8NdyalqaLGRrufw+K971UfNToT/SEAW9O\n+HlZLIV+itAVbBly5/LJFc16aPUH2L47r8qFIFB0PfjLSAsHhbRRs6jLyuZTtzGi\no4iod7/5R+ip216fu7cxiAE3wBhfKTT7IYnAnW7+tYLPqlGcszJkSJtZozHcxudw\n4nVRu+2pkP9ud1YnbWSVGDADMQ33YaKSrm4O+dCDw5EqhmYo+0xH39TNS/2GjCK2\n83R0ZvuS9KkYCNhSYYEKVKiyTuavTpsWWwIDAQAB\n-----END RSA PUBLIC KEY-----",
            '69866dde69bced6b006708b936e038c3',
        ),
    )

    for appid, uniqueKey, publicKey, token in seeds:
        app = {
            'id': appid,
            'unique_key': uniqueKey,
            'public_key': publicKey,
            'token': token,
            'token_expired': now_format(),
        }
        # Update the row when it already exists, insert it otherwise.
        if db.getbyid('app', appid):
            db.updatebyid('app', app, appid)
        else:
            db.insert('app', app)

    return True
예제 #3
0
def _formatStatic(domain, url, filename, filekey, filetype, md5Body):
    '''Assemble the record describing one static file.

    The arguments are copied into the record as given; ``md5_url`` is
    computed from ``url``, and both timestamps come from ``now_format()``.
    '''
    record = dict(
        domain=domain,
        url=url,
        file_name=filename,
        file_key=filekey,
        file_type=filetype,
    )
    record['md5_url'] = md5(url)
    record['md5_body'] = md5Body
    record['create_at'] = now_format()
    record['update_at'] = now_format()
    return record
예제 #4
0
def _formatParse(execute, urlInfo, result, md5Body, parseType='regular'):
    '''Assemble the record that stores the parse result of one page.

    execute  -- execution row; site_id, task_id, app_id and id are read.
    urlInfo  -- URL row; id, referer, url and md5_url are read.
    result   -- the parse output, stored as-is under ``result``.
    md5Body  -- fingerprint of the page body, stored as-is.
    parseType -- label stored under ``parse_type`` (default 'regular').

    Returns the record dict; both timestamps come from ``now_format()``.
    '''
    return {
        'site_id': execute['site_id'],
        'task_id': execute['task_id'],
        'app_id': execute['app_id'],
        'execute_id': execute['id'],
        'url_id': urlInfo['id'],
        'referer': urlInfo['referer'],
        'url': urlInfo['url'],
        'md5_url': urlInfo['md5_url'],
        'md5_body': md5Body,
        # Honour the caller's parse type; it used to be hard-coded to
        # 'regular', which silently ignored the argument.
        'parse_type': parseType,
        'result': result,
        'create_at': now_format(),
        'update_at': now_format(),
    }
예제 #5
0
def _formatOutlink(execute, referer, url, md5Body='', invisible=0):
    '''Format one outbound link as a storable record.

    execute supplies task_id, id and domain; referer is the page the link
    was found on and url is the link target. md5Body and invisible are
    stored as given. ``filterwords`` is always stored as an empty string.
    '''
    record = {}
    record['task_id'] = execute['task_id']
    record['execute_id'] = execute['id']
    record['domain'] = execute['domain']
    record['referer'] = referer
    record['md5_referer'] = md5(referer)
    record['url'] = url
    record['md5_url'] = md5(url)
    record['md5_body'] = md5Body
    record['invisible'] = invisible
    record['filterwords'] = ''
    record['date'] = getDate()
    record['create_at'] = now_format()
    record['update_at'] = now_format()
    return record
예제 #6
0
def save(params):
    '''Save a snapshot job and queue it for processing.

    params must provide app_key, batch, type, url, filename, words (an
    iterable of strings), proxy and notify_url.

    For type 'code' the file identified by params['url'] is downloaded,
    its lines are counted, and one snapshot URL is produced per page of
    800 lines (the '.png' in the base URL becomes '_<page>.png'). For any
    other type there is a single snapshot URL.

    The job is inserted into the 'snapshot' collection and produced to the
    'snapshot' queue. Returns the list of snapshot URLs.
    '''
    item = {
        'app_key': params['app_key'],
        'batch': params['batch'],
        'uuid': uuid1().hex,
        'type': params['type'],
        'url': params['url'],
        'filename': params['filename'],
        'words': ",".join(params['words']),
        'proxy': params['proxy'],
        'notify_url': params['notify_url'],
        'error': '',
        'status': 0,
        'create_at': now_format(),
        'update_at': now_format(),
    }

    # Base snapshot URL, shared by both branches below.
    snapshotUrl = '%s/%s/%s' % (QINIU_DOMAIN, QINIU_KEY_PREFIX,
                                basename(params['filename']))

    if item['type'] == 'code':
        downloadUrl = '%s/download?filekey=%s' % (DFS_URL, params['url'])
        pagesize = 800
        # Close the HTTP response deterministically instead of leaving it
        # to garbage collection.
        with urlopen(downloadUrl) as resp:
            body = resp.read().decode('utf-8', 'ignore')
        total = len(body.split("\n"))
        pageTotal = math.ceil(total / pagesize)
        snapshots = [
            snapshotUrl.replace('.png', '_%s.png' % i)
            for i in range(pageTotal)
        ]
        item['snapshot'] = "\n".join(snapshots)
    else:
        item['snapshot'] = snapshotUrl

    # Insert a copy so the stored document and the queued item stay separate.
    itemid = mgdb.c_insert('snapshot', deepcopy(item))
    item['id'] = itemid
    item[mqidKey] = item['id']
    item[batchKey] = item['batch']
    Mq.produce([item], 'snapshot')
    return item['snapshot'].split("\n")
예제 #7
0
def consume(mqkey=''):
    '''Take the next task off a queue and mark it as in progress.

    Pops one JSON item from the Redis list ``mq_<mqkey>_ready``. Returns
    False when mqkey is empty or the list yields nothing. Otherwise it
    records the batch in ``stats_batch_run`` if not yet present, stores the
    item's queue id on this process's ``process_list`` entry, moves the
    item from the ready collection to the doing collection, shifts the
    ready/doing counters, and returns the decoded item.
    '''
    if not mqkey:
        return False

    readyKey = 'mq_%s_ready' % mqkey
    doingKey = 'mq_%s_doing' % mqkey

    # The Redis list and the Mongo "ready" collection share the same name.
    raw = redis.lpop(readyKey)
    if not raw:
        return False
    item = json.loads(raw.decode('UTF8'))

    # Record the batch on first sight.
    batchRuns = mongoMq['stats_batch_run']
    batchId = item[batchKey]
    if not batchRuns.find_one({'mqkey': mqkey, 'batch': batchId}):
        batchRuns.insert({
            "mqkey": mqkey,
            "batch": batchId,
            "is_end": 0,
            "start_at": now_format(),
            "end_at": ""
        })

    # Update the queue id this process is currently working on.
    title = _get_proc_title()
    procFilter = {'hostname': hostname, 'title': title[7:]}
    mqid = item[mqidKey]
    mongoMq['process_list'].find_and_modify(procFilter,
                                            {'$set': {mqidKey: mqid}})

    # Move the item: upsert into "doing", then remove from "ready".
    mongoMq[doingKey].find_and_modify({mqidKey: mqid}, {'$set': item},
                                      upsert=True)
    mongoMq[readyKey].delete_many({mqidKey: mqid})

    # One item left "ready" and entered "doing": adjust both counters.
    counterShift = {"ready": -1, "doing": 1}
    mongoMq['stats_mq'].find_and_modify({'mq_key': mqkey},
                                        {"$inc": dict(counterShift)})
    mongoMq['stats_batch_stage'].find_and_modify(
        {"mqkey": mqkey, 'batch': batchId},
        {"$inc": dict(counterShift)})
    return item
예제 #8
0
파일: ctl.py 프로젝트: xx1820017/lixiang
    def run(self):
        '''Keep the process list in step with the process configuration.

        Returns False straight away unless the last six characters of
        ``hostname`` are 'master'. Otherwise loops forever: every ten
        seconds it reads the configuration collection, derives the
        expected "<key>-<n>" process titles per host, upserts them into
        the process-list collection, and marks titles that are no longer
        configured as 'disable'.
        '''
        if hostname[-6:] != 'master':
            return False

        while True:
            wanted = []
            stale = []

            for conf in mongoMq[pConfigKey].find({}, {"_id": 0}):
                host = conf['hostname']
                state = conf['status']
                wantedTitles = []

                # Every key other than hostname/status is iterated as a
                # queue key with the number of processes wanted for it.
                for key, value in conf.items():
                    if key in ['hostname', 'status']:
                        continue
                    for index in range(value):
                        title = "%s-%s" % (key, index + 1)
                        # Timestamp with '-', ':' and ' ' stripped out.
                        stamp = now_format()
                        for ch in ('-', ':', ' '):
                            stamp = stamp.replace(ch, '')
                        wanted.append({
                            "hostname": host,
                            "title": title,
                            "processid": 0,
                            "mqkey": key,
                            "status": state,
                            "mqid": 0,
                            "update": stamp,
                        })
                        wantedTitles.append(title)

                # Titles stored for this host that are no longer configured.
                known = [
                    doc['title'] for doc in mongoMq[pListKey].find(
                        {"hostname": host}, {'_id': 0})
                ]
                for title in set(known) - set(wantedTitles):
                    stale.append({'hostname': host, 'title': title})

            for entry in wanted:
                selector = {
                    "hostname": entry['hostname'],
                    "title": entry['title'],
                }
                if mongoMq[pListKey].find_one(selector):
                    # Existing entry: refresh only status and timestamp.
                    mongoMq[pListKey].find_and_modify(
                        dict(selector),
                        {'$set': {
                            "status": entry['status'],
                            "update": entry['update'],
                        }})
                else:
                    mongoMq[pListKey].insert(entry)

            for entry in stale:
                mongoMq[pListKey].find_and_modify(
                    {'hostname': entry['hostname'], 'title': entry['title']},
                    {'$set': {'status': 'disable'}})

            sleep(10)
예제 #9
0
def crawl(urlInfo):
    '''Fetch one URL, store the response and queue the links found in it.

    urlInfo is the spider URL row; the keys read here are id, task_id,
    execute_id, method, url, request_headers, url_type and depth.

    Steps: fetch the page; save the formatted response; for HTML pages of
    the crawled site (and unless the task type is 'mirror_one') extract
    links by regex, plus dark links when a 'darklink' piping is enabled
    for the task; de-duplicate them; store the parse result, the outbound
    links and the new URL rows; finally push the URLs that pass the
    execution limits onto the 'spider' queue (and the 'mirror' queue for
    mirror tasks).

    Returns False when the execution row is missing or an exception was
    logged, True on every early exit, and None after a full run.
    NOTE(review): the None on full success looks unintentional; confirm
    how callers interpret the return value before changing it.
    '''
    execute = mgdb.execute_getbyid(urlInfo['execute_id'])
    if not execute: return False
    sql = "select * from task_piping where task_id=:task_id and type=:type and status=:status"
    pipingDark = db.fetchone(sql, {'task_id': execute['task_id'], 'type': 'darklink', 'status': 1})

    try:
        ## If the task has already finished, return (disabled)
        #if execute['status'] == 2 or urlInfo['status'] == 2:
        #    return True

        logger.info("crawl:uid[%s]:tid[%s]:eid[%s]:method[%s]::%s" % (
            urlInfo['id'], urlInfo['task_id'], urlInfo['execute_id'], urlInfo['method'], urlInfo['url']
        ))

        # Fetch the page
        #proxy = {'url':'http://%s' % MIRROR_PROXY} if execute['task_type'] == 'mirror' else {}
        proxy = {}
        requestInfo = spiderRequest(urlInfo['url'], urlInfo['method'], urlInfo['request_headers'], proxy=proxy)

        # Request error, or 304 / any other non-200 status: record it and stop
        if requestInfo['error'] or requestInfo['http_code'] != 200:
            mgdb.spiderurl_save(_formatResponse(requestInfo, execute, urlInfo), urlInfo['id'])
            return True

        # Normal response: work out the file type from Content-Type
        responseHeaders = requestInfo['response_headers']
        contentTypeRaw = responseHeaders['Content-Type'] if 'Content-Type' in responseHeaders.keys() else None
        contentType = parseContentType(contentTypeRaw, default = 'text/html')
        fileType = mime2file(contentType)

        # Save the response information
        fileInfo = download(requestInfo, urlInfo, execute, fileType)
        response = _formatResponse(requestInfo, execute, urlInfo, fileInfo)
        mgdb.spiderurl_save(response, urlInfo['id'])

        # Non-HTML page: nothing to parse
        if fileType != 'html': return True

        # External link: no further analysis
        if urlInfo['url_type'] != 'self': return True

        # Single-page mirror: do not analyse the page
        if execute['task_type'] == 'mirror_one': return True

        # Parse the page with regular expressions
        urlItems = parse_reg(requestInfo)
        # Detect dark links
        if pipingDark:
            result = parse_darklink(requestInfo['url'])
            darklinks = _formatUrls(result, 1) if result else []
            urlItems = urlItems + darklinks

        # Browser-based parsing (disabled)
        #if execute['limit_js']:
        #    results = parse_browser(requestInfo)
        #    if results: urlItems = urlItems + results

        # De-duplicate the URLs
        urlItems = _urls_uniq(urlItems)

        # Collect the new URLs
        undos = []
        mirrors = []
        queueOut = []
        outlinks = []
        queueSite = []
        for row in urlItems:
            url = row['url'].strip()
            if not isUrl(url): continue

            fileExtension = extension(url)

            urlType = _getDomainType(url, execute['domain'])
            isExists = _checkUrlExists(execute['id'], url, row['method'], row['invisible'])
            if isExists: continue

            flagOutlink = 0
            item = {}
            item['site_id'] = execute['site_id']
            item['task_id'] = execute['task_id']
            item['app_id'] = execute['app_id']
            item['execute_id'] = execute['id']
            item['task_type'] = execute['task_type']
            item['url'] = url
            item['url_type'] = urlType
            item['file_extension'] = fileExtension
            item['method'] = row['method']
            item['invisible'] = row['invisible']
            item['post'] = json.dumps(row['post'], ensure_ascii=False) if row['post'] else ''

            # Links outside the site, when dark links are not analysed,
            # get status 5, i.e. they do not need to be crawled.
            item['status'] = 5
            if urlType == 'self':
                item['status'] = 0
            else:
                if fileExtension in staticExts:
                    item['status'] = 0
                else:
                    if pipingDark:
                        flagOutlink = 1
                        item['status'] = 0
            if urlType == 'other':
                # Pass the flag by keyword: the fourth positional parameter
                # of _formatOutlink is md5Body, so a positional argument
                # stored the flag as md5_body and left invisible at 0.
                outlinks.append(_formatOutlink(execute, urlInfo['url'], url, invisible=row['invisible']))
            item['referer'] = urlInfo['url']
            item['exec_level'] = execute['exec_level']
            item['depth'] = int(urlInfo['depth']) + 1
            item['query'] = row['query']
            item['pattern_path'] = row['pattern_path']
            item['pattern_query'] = row['pattern_query']
            item['create_at'] = now_format()
            item['update_at'] = now_format()
            if flagOutlink:
                queueOut.append(item)
            else:
                queueSite.append(item)

        if urlItems:
            mgdb.c_insert('parse', _formatParse(execute, urlInfo, urlItems, response['md5_body'], 'regular'))
        if outlinks: mgdb.c_insert_batch('outlink', outlinks)
        stats = Mq.get_stats_batch('spider', execute['id'])
        if queueSite:
            results = mgdb.c_insert_batch('spiderurl', queueSite)
            for item in results:
                # Non-zero status: do not crawl
                if item['status'] != 0: continue
                # Depth beyond the limit: do not crawl
                if item['depth'] > execute['limit_depth']: continue
                # Total beyond the limit: do not crawl
                if stats['total'] > execute['limit_total']: continue
                # Mirror task: do not crawl images
                if execute['task_type'] == 'mirror' and item['file_extension'] in staticExts: continue
                # Single-page monitoring: do not crawl sub-pages
                if execute['task_type'] in ['monitor_one', 'mirror_one'] and item['file_extension'] not in staticExts: continue
                # Images disabled for this execution: do not crawl them
                if not execute['limit_image'] and item['file_extension'] in staticExts: continue
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']

                # Put the item on the to-crawl queue
                undos.append(item)

                # Put the item on the mirror queue
                if execute['task_type'] == 'mirror': mirrors.append(item)
        if queueOut:
            # Dark-link candidates are queued without the limit checks above.
            results = mgdb.c_insert_batch('spiderurl', queueOut)
            for item in results:
                item[batchKey] = item['execute_id']
                item[mqidKey] = item['id']
                undos.append(item)
        if undos: Mq.produce(undos, 'spider')
        if mirrors: Mq.produce(mirrors, 'mirror')

    except Exception as e:
        logger.exception(e)
        return False
예제 #10
0
def execute_init(eid):
    '''Prepare a task execution for crawling; written to be re-runnable.

    Loads the 'task_execute' row ``eid`` (returns False if it is missing),
    formats its timestamps, copies it into the 'execute' collection when
    no copy exists yet, inserts the start URLs into 'spiderurl' when fewer
    rows than start URLs are stored, and produces the URL rows to the
    'spider' queue (plus 'mirror' for mirror tasks) when the spider queue
    collections hold fewer entries than there are start URLs.
    Returns True otherwise.
    '''
    execute = db.getbyid('task_execute', eid)
    if not execute:
        return False

    # Format each timestamp; empty values become ''.
    for field in ('create_at', 'update_at', 'start_at', 'end_at'):
        execute[field] = formatTimestamp(
            execute[field]) if execute[field] else ''
    execute['status'] = 101

    mgExecute = mgdb.execute_getbyid(eid)
    if not mgExecute:
        mgdb.c_insert('execute', deepcopy(execute), autoid=False)

    startUrls = json.loads(execute['start_urls'])
    startUrlsLen = len(startUrls)
    spiderUrls = mongoSpider['spiderurl']
    urlCount = spiderUrls.find({'execute_id': eid}, {'_id': 0}).count()
    if startUrlsLen > urlCount:
        urlRows = [{
            'site_id': execute['site_id'],
            'task_id': execute['task_id'],
            'app_id': execute['app_id'],
            'task_type': execute['task_type'],
            'execute_id': eid,
            'exec_level': execute['exec_level'],
            'url': startUrl,
            'url_type': 'self',
            'method': 'get',
            'status': 0,
            'create_at': now_format(),
            'update_at': now_format(),
        } for startUrl in startUrls]
        mgdb.c_insert_batch('spiderurl', urlRows)

    # Tag every stored URL row with its queue id and batch id.
    undos = list(mongoSpider['spiderurl'].find({'execute_id': eid}, {'_id': 0}))
    for undo in undos:
        undo[mqidKey] = undo['id']
        undo[batchKey] = undo['execute_id']
    # Both queues receive the same row objects.
    undos_spider = list(undos)
    undos_mirror = list(undos)

    # Entries already present across all stages of the spider queue.
    pre = 'mq_spider_'
    total = 0
    for stage in ('undo', 'ready', 'doing', 'done'):
        total += mongoMq[pre + stage].find({batchKey: eid}).count()

    if startUrlsLen > total:
        # Add to the spider queue
        Mq.produce(undos_spider, 'spider')

        # Add to the mirror queue
        if execute['task_type'] == 'mirror':
            Mq.produce(undos_mirror, 'mirror')

    if not mgExecute:
        db.updatebyid('task_execute', {'status': 101}, eid)
    return True