Пример #1
0
def _formatResponse(requestInfo, execute, urlInfo, fileInfo=('','')):
    '''Build the result dict for one crawled URL item.

    requestInfo is read for 'nettime', 'error', 'http_code' and, on
    success, 'body', 'redirects', 'request_headers' and 'response_headers'.
    urlInfo is read for 'depth' and, on success, 'url'.
    fileInfo is a (file_name, file_path) pair; only used on success.
    execute is accepted but not read by this function.

    Returns a dict with status 2 when the request had no error and
    answered HTTP 200, a shorter dict with status 3 otherwise, or False
    when building the dict raised (the exception is logged).
    '''
    try:
        netTime = requestInfo['nettime']
        if not requestInfo['error'] and requestInfo['http_code'] == 200:
            # Success: full record, header/redirect data serialized as JSON.
            return {
                'nettime': netTime,
                'status': 2,
                'end_at': now_format(),
                'depth': urlInfo['depth'],
                'md5_url': md5(urlInfo['url']),
                'md5_body': md5(requestInfo['body']),
                'redirects': json.dumps(requestInfo['redirects'], ensure_ascii=False),
                'http_code': requestInfo['http_code'],
                'request_headers': json.dumps(requestInfo['request_headers'], ensure_ascii=False),
                'response_headers': json.dumps(requestInfo['response_headers'], ensure_ascii=False),
                'file_name': fileInfo[0],
                'file_path': fileInfo[1],
            }
        # Failure: transport error or non-200 answer.
        return {
            'nettime': netTime,
            'status': 3,
            'http_code': requestInfo['http_code'],
            'error': repr(requestInfo['error']),
            'end_at': now_format(),
            'depth': urlInfo['depth'],
        }
    except Exception as exc:
        logger.exception(exc)
        return False
Пример #2
0
def mg_spiderjsurl_save(record, urlid=None):
    '''Insert or update a spiderjs URL and mirror the merged row into Mongo.

    record is a dict of field values. NOTE: it is modified in place
    ('id' is removed; 'method', 'md5_url' and 'md5_body' may be added).
    urlid, when given, selects the row to update and takes precedence
    over record['id'].

    Returns False when there is neither an id nor a 'url' to insert,
    otherwise the id of the saved row.
    '''
    if 'id' in record:
        # An explicit urlid wins; either way 'id' must not be written
        # back as an ordinary column value.
        if not urlid:
            urlid = record['id']
        del record['id']
    if not urlid and 'url' not in record:
        return False

    # New rows default to GET.
    if not urlid and 'method' not in record:
        record['method'] = 'GET'
    if 'url' in record:
        record['md5_url'] = md5(record['url'])
    if 'body' in record:
        record['md5_body'] = md5(record['body'])

    dbFields = [
        'id', 'url', 'md5_url', 'referer', 'method', 'status', 'start_at',
        'end_at', 'create_at'
    ]
    # Fixed: 'http_code' and 'request_headers' previously carried a stray
    # comma inside the string literal, so they never matched the real keys
    # and their values were silently dropped from the Mongo document.
    mongoFields = [
        'id', 'url', 'md5_url', 'referer', 'method', 'http_code',
        'request_headers', 'response_headers', 'redirects', 'body',
        'md5_body', 'parse_result', 'error', 'status', 'start_at', 'end_at',
        'create_at'
    ]

    insertRow = {field: record[field] for field in dbFields if field in record}
    if insertRow:
        if urlid:
            db.updatebyid('spiderjs_url', insertRow, urlid)
        else:
            urlid = db.insert('spiderjs_url', insertRow)

    jsRow = db.fetchone("select * from spiderjs_url where id=:id limit 1",
                        {'id': urlid})
    # NOTE(review): jsRow is used unguarded below; if no row matches urlid
    # this presumably raises, as it did before - confirm with callers.
    mongoRow = mgdb.c_getbyid('spiderjsurl', urlid)
    mongoRow = mongoRow if mongoRow else {}

    # Precedence per field: SQL row > incoming record > existing Mongo
    # value > empty string.
    for field in mongoFields:
        value = ''
        if field in mongoRow.keys():
            value = mongoRow[field]
        if field in record:
            value = record[field]
        if field in jsRow.keys():
            value = jsRow[field]
        mongoRow[field] = value

    mgdb.spiderjsurl_save(mongoRow)
    return urlid
Пример #3
0
def crawljs(taskInfo):
    '''Crawl one JS task URL, parse it and persist the result.

    taskInfo is read for 'status', 'url' and 'id'.

    Returns True when the task was already processed (status not 0 or 1)
    or was crawled and saved, and False when any step raised (the
    exception is logged).
    '''
    try:
        # Already crawled: do not crawl again.
        if taskInfo['status'] not in (0, 1):
            return True
        # Fetch the page source.
        requestInfo = spiderRequest(taskInfo['url'])

        # Keep only the parsed records that yield a usable URL row.
        parseResults = []
        for record in _parseForJs(taskInfo['url']):
            urlRow = _parseForUrl(record)
            if urlRow:
                parseResults.append(urlRow)

        updateRow = {
            'id': taskInfo['id'],
            'http_code': requestInfo['http_code'],
            'response_headers': json.dumps(requestInfo['response_headers'],
                                           ensure_ascii=False),
            'body': requestInfo['body'],
            'md5_body': md5(requestInfo['body']),
            'parse_result': json.dumps(parseResults, ensure_ascii=False),
            'status': 2,
            'end_at': getTime('%Y-%m-%d %H:%M:%S'),
        }
        # Persist the result.
        mg_spiderjsurl_save(updateRow)
        # Fixed: the success path used to fall through and return None,
        # which a truthiness check could not tell apart from failure.
        return True
    except Exception as e:
        logger.exception(e)
        return False
Пример #4
0
def _urls_uniq(urlItems):
    '''对格式化后的URL去重'''
    urlsUniq = {}
    for i in urlItems:
        key = md5(i['url'] + i['method'] + str(i['invisible']))
        if key not in urlsUniq.keys(): urlsUniq[key] = i
    return list(urlsUniq.values())
Пример #5
0
def download(requestInfo, urlInfo, execute, fileType='html'):
    '''Store a fetched body as a file and register it as a static record.

    requestInfo is read for 'body' and 'url', urlInfo for 'id', and
    execute for 'domain', 'id' and 'task_id'. fileType 'html' stores the
    body as an HTML page; any other value is stored as type 'img'.

    If a static record with the same domain and body digest already
    exists, its (file_name, file_key) is returned without uploading.

    Returns a (file_name, file_key) tuple, or ('', '') when any step
    raised (the exception is logged).
    '''
    try:
        md5Body = md5(requestInfo['body'])
        cached = mgdb.static_get(execute['domain'], md5Body)
        if cached:
            return (cached['file_name'], cached['file_key'])

        localfile = '%s/%s/%s.tmp' % (PATH_TMP_UPLOAD, execute['domain'], md5Body)
        if not exists(dirname(localfile)):
            mkdirs(dirname(localfile))

        if fileType == 'html':
            filename = "%s_%s.html" % (execute['id'], urlInfo['id'])
            filekey = 'html/%s/%s/%s_%s.html.%s' % (execute['domain'], execute['task_id'], execute['id'], urlInfo['id'], md5Body)
        else:
            filename = basename(requestInfo['url'])
            # Fixed: the scheme used to be stripped with a hard-coded [7:],
            # which is only right for "http://" and left a leading "/" on
            # "https://" URLs. Drop whatever precedes the first "://".
            urlPath = requestInfo['url'].split('://', 1)[-1]
            filekey = "static/%s/%s.%s" % (execute['domain'], urlPath, md5Body)
            fileType = 'img'

        fwriteBin(localfile, requestInfo['body'])
        ydfs_upload(filekey, localfile)
        mgdb.c_insert('static', _formatStatic(execute['domain'], requestInfo['url'], filename, filekey, fileType, md5Body))
        return (filename, filekey)
    except Exception as e:
        logger.exception(e)
        return ('', '')
Пример #6
0
def _formatOutlink(execute, referer, url, md5Body='', invisible=0):
    '''Build the record dict for one outbound link.

    execute is read for 'task_id', 'id' and 'domain'. The md5 digests of
    referer and url are stored next to the raw values; 'filterwords'
    starts out empty, and the date/timestamp fields are filled from
    getDate() and now_format().
    '''
    outlink = dict(
        task_id=execute['task_id'],
        execute_id=execute['id'],
        domain=execute['domain'],
    )
    outlink['referer'] = referer
    outlink['md5_referer'] = md5(referer)
    outlink['url'] = url
    outlink['md5_url'] = md5(url)
    outlink['md5_body'] = md5Body
    outlink['invisible'] = invisible
    outlink['filterwords'] = ''
    outlink['date'] = getDate()
    outlink['create_at'] = now_format()
    outlink['update_at'] = now_format()
    return outlink
Пример #7
0
def getToken_key(key):
    '''Issue a fresh access token for the app identified by unique key.

    Looks up the app row whose unique_key equals key, stores a new token
    and its expiry on that row, and returns
    {'token': ..., 'expired': ...}. Returns False when no app matches.
    '''
    # Local import keeps this block self-contained.
    import secrets

    appObj = db.fetchone('select * from app where unique_key=:key', {'key': key})
    if not appObj:
        return False

    # Expiry is getTime() + 7200 - presumably two hours in seconds; confirm
    # against the getTime helper.
    tokenExpired = getTime("%Y-%m-%d %H:%M:%S", (getTime() + 7200))
    # Fixed: the token used to be md5(tokenExpired), i.e. derived only from
    # the expiry timestamp - guessable and identical for every app refreshed
    # in the same second. token_hex(16) gives 32 hex characters.
    # NOTE(review): assumes 32 hex chars matches the previous md5 output
    # length and that tokens are validated by lookup - confirm.
    token = secrets.token_hex(16)
    db.updatebyid('app', {'token': token, 'token_expired': tokenExpired}, appObj['id'])
    return {'token': token, 'expired': tokenExpired}
Пример #8
0
def _checkUrlExists(executeid, url, method, invisible):
    '''Report whether this url/method/invisible combination was seen before.

    Seen combinations are kept in the redis hash 'exists_<executeid>',
    keyed by the md5 of the combined value. On first sight the entry is
    recorded, the hash expiry is set to 86400, and False is returned;
    otherwise True is returned and nothing is written.
    '''
    bucket = 'exists_%s' % executeid
    fingerprint = method + '-' + url + str(invisible)
    field = md5(fingerprint)

    alreadySeen = redis.hexists(bucket, field)
    if not alreadySeen:
        # First sighting: remember it and refresh the expiry on the hash.
        redis.hset(bucket, field, fingerprint)
        redis.expire(bucket, 86400)
        return False
    return True
Пример #9
0
def _formatStatic(domain, url, filename, filekey, filetype, md5Body):
    '''Build the record dict describing one stored static file.

    The arguments are copied into the record as given; 'md5_url' is the
    md5 of url, and both timestamp fields are filled from now_format().
    '''
    record = {'domain': domain, 'url': url}
    record['file_name'] = filename
    record['file_key'] = filekey
    record['file_type'] = filetype
    record['md5_url'] = md5(url)
    record['md5_body'] = md5Body
    record['create_at'] = now_format()
    record['update_at'] = now_format()
    return record
Пример #10
0
def execCasper(content=None):
    '''Run a script through casperjs and return its combined output.

    content is written to a file under PATH_TMP_NODEJS, named from the
    current date and the md5 of content, and that file is passed to the
    casperjs command.

    Returns the decoded stdout+stderr of the process, or False when any
    step raised (the exception is logged).
    '''
    try:
        filename = "%s/%s_%s" % (PATH_TMP_NODEJS, getTime('%Y%m%d'),
                                 md5(content))
        write(filename, content)
        # Fixed: pass an argument list instead of a shell command string so
        # a path with spaces or shell metacharacters cannot break the
        # command, and use Popen as a context manager so the pipe is closed
        # and the child is reaped (it was previously never waited on).
        with Popen(['casperjs', filename],
                   close_fds=True,
                   bufsize=-1,
                   stdout=PIPE,
                   stderr=STDOUT) as child:
            output = child.stdout.read().decode()
        # NOTE: the generated script file is left on disk, as before.
        return output
    except Exception as e:
        logger.exception(e)
        return False