예제 #1
0
def onlyInsertCap(queue):
    """Worker loop: consume chapter objects from *queue*, insert each into
    the MySQL `dushu` database via a small dedicated connection pool, upload
    the chapter JSON to the object-store bucket, and record its digest in the
    global `donedegest` set.  Runs forever; never returns.
    """
    from DBUtils.PooledDB import PooledDB

    # Private 1-2 connection pool for this worker so it does not share
    # cursors with other threads.
    pool3 = PooledDB(creator=MySQLdb, mincached=1, maxcached=2,
                    host=EADHOST, port=3306, user="******",
                    passwd=EADPASSWD, db="dushu", use_unicode=True, charset='utf8')
    conn3 = pool3.connection()
    csor3 = conn3.cursor()

    # conn.set_character_set('utf8')
    # Force utf8 on the session three ways so CJK text survives round-trips.
    csor3.execute('SET NAMES utf8')
    csor3.execute("SET CHARACTER SET utf8")
    csor3.execute("SET character_set_connection=utf8")


    while True:
        capObj = queue.get()  # blocks until a chapter object is available
        # print i, capObj['source'] + capObj['idx']
        try:
            capId = insertCapWithCapObj(capObj, conn3, csor3)
            if not capId:
                continue  # insert rejected (e.g. duplicate): no upload, no digest bookkeeping
            # NOTE(review): assumes insertCapWithCapObj populated capObj['id'] — confirm
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
            global donedegest
            donedegest.add(capObj['digest'])
        except Exception as e:
            print 'insertCap error, ', e
예제 #2
0
def crawlCapsWithBookObj(allowUpdate, bookId, bookObj):
    '''
    根据book对象处理章节,新增或更新,会判断库中是否已有某章节
    :param allowUpdate: 是否允许更新
    :param bookId: shuqi的id
    :param bookObj: 库中的书信息
    :return: 
    '''
    newChapNum = bookObj['chapterNum']

    global donedegest
    capObjList = getCapObjsByBookObj(allowUpdate, bookId, bookObj)
    if not capObjList:
        print 'no capObjList, sid: ', bookId
        return newChapNum
    for capObj in capObjList:
        capId = insertCapWithCapObj2(capObj)
        donedegest.add(capObj['digest'])

        if not capId:
            newChapNum = min(newChapNum, capObj['idx'] + 1)
            continue
        uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
        # existsCaps = getExistsCaps(bookObj['id'])
    return newChapNum
def handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj):
    """Fetch one chapter's content, normalize the chapter dict, persist it
    and upload its JSON.

    :return: the chapter index on success, 0 on any failure
    """
    rawUrl = chapObj['url']
    body = getContentWithUA(rawUrl)
    contentObj = json.loads(body)

    tooSmall = (not contentObj or not contentObj['content']
                or len(contentObj['content']) < MinChapContentLength)
    if tooSmall:
        myLogging.error('zid %s content too small skip, chapContentUrl %s', bookObj['id'], rawUrl)
        return 0

    # Merge the fetched fields, then normalize to our db schema.
    chapObj.update(contentObj)
    chapObj['title'] = chapObj['name']
    chapObj['rawUrl'] = rawUrl
    chapObj['idx'] = int(chapObj.pop('serialNumber'))
    chapObj['size'] = len(chapObj['content'])
    chapObj['bookId'] = bookObj['id']
    chapObj['source'] = bookObj['source']
    chapObj['bookUUID'] = bookObj['digest']
    chapObj['digest'] = getCapDigest(bookObj, chapObj, chapObj['bookChapterId'])
    chapObj['content'] = textClean(chapObj['content'])

    capId = insertCapWithCapObj(chapObj, allowUpdate=allowUpdate)
    if not capId:
        myLogging.error('no chapId cid %s', chapObj['bookChapterId'])
        return 0

    uploadJson2Bucket(str(chapObj['id']) + '.json', json.dumps(chapObj))
    return chapObj['idx']
예제 #4
0
def fixNewLineByBookObjs(quanBenObjs):
    """Re-clean stored chapter content for every book in *quanBenObjs*:
    download each chapter JSON from OSS, run textClean over its content and
    upload the cleaned JSON back.  Chapters with no OSS object are removed
    from the db.
    """
    from parse.contentHelper import textClean

    for book in quanBenObjs:
        bookId = book['id']
        for chapId in getCapIdsByBookId(bookId):
            try:
                resp = requests.get(ossBaseUrl + str(chapId) + '.json')
                chapObj = json.loads(resp.text)

                # Missing/contentless OSS object: the db row is orphaned.
                if not chapObj or not chapObj.has_key('content'):
                    delCapById(chapId)
                    myLogging.info('chap id %s, has no oss obj, delete',
                                   chapId)
                    continue

                chapObj['content'] = textClean(chapObj['content'])
                uploadJson2Bucket(str(chapId) + '.json', json.dumps(chapObj))
                myLogging.info('succ cid %s', chapId)
            except Exception as e:
                myLogging.error('chap id %s, with exception: %s', chapId,
                                traceback.format_exc())
예제 #5
0
def handleCapUpload(cap):
    cid = cap[0]
    capUrl = cap[2]
    bookId = cap[5]
    unclearContent = cap[4]
    capObj = dict()
    capObj['id'] = cap[0]
    capObj['title'] = cap[1]
    capObj['rawUrl'] = cap[2]
    capObj['source'] = cap[3]
    capObj['content'] = cap[4]
    capObj['bookId'] = cap[5]
    capObj['idx'] = cap[6]
    capObj['digest'] = cap[7]
    capObj['size'] = cap[8]
    capObj['bookUUID'] = cap[9]
    content = unclearContent
    if unclearContent and not (u'        言情小说_打造最新原创' in unclearContent
                               or unclearContent == 'None'):
        uploadJson2Bucket(str(cid) + '.json', json.dumps(capObj))
    else:
        try:
            if not capUrl or len(capUrl) < 1:
                print cid, 'no url, bookId : ', bookId
            else:
                if 'shuqireader' in capUrl:
                    content = getContentByUrl(capUrl)
                    # updateContentById(cid, content)
                else:
                    content, host = getAndParse(capUrl)
                    if not content:
                        print cid, ' getAndparse content failed, bookId : ', bookId
                        # continue
                        # updateContentById(cid, content)
                        # cap[4] = content
            capObj['content'] = content

            upload2Bucket(str(cid) + '.json', json.dumps(capObj))
        except Exception as e:
            print 'cid ', cid, 'error: ', e
        except ValueError as er:
            print 'cid ', cid, 'error: ', er
def handlChapsByBookObjZidBocId(bookObj, zid, chapListObj, allowUpdate=False):
    """Crawl, clean and persist every chapter listed in *chapListObj*.

    :param bookObj: book record from our db (provides id/source/digest)
    :param zid: zhuishu book id, used for logging only
    :param chapListObj: parsed chapter-list response; must contain 'chapters'
    :param allowUpdate: when True, chapters whose idx or title are already in
        the db are skipped rather than re-inserted
    :return: the highest chapter index successfully inserted and uploaded
    """
    resInx = 0  # highest index persisted so far (the return value)
    if not chapListObj:
        myLogging.error('zid %s get chaps list null', zid)
        return resInx
    if not chapListObj.has_key('chapters'):
        myLogging.error('zid %s chaps list no data', zid)
        return resInx
    capIdxs = set()
    capTitles = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in db
        capTitles = getChapTitlesByBookId(bookObj['id'])  # chapter titles already in db
    for idx in range(0, len(chapListObj['chapters'])):
        try:
            chapObj = chapListObj['chapters'][idx]

            # Skip chapters we already hold (matched by title or index).
            if chapObj['title'] in capTitles:
                continue
            if idx in capIdxs:
                continue
            chapObj['cid'] = chapObj['link']
            if chapObj.has_key('id'):
                chapObj['cid'] = chapObj['id']
            chapObj['idx'] = idx

            chapContentUrl = ZSSQCHAPCONTENTBASEURL + quote(chapObj['link'])
            chapContentText = getContentWithUA(chapContentUrl)
            if not chapContentText:
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get chapContent null ',
                    zid, bookObj['id'], chapObj['cid'])
                continue
            chapContentObj = json.loads(chapContentText)
            if not chapContentObj or not chapContentObj.has_key('chapter'):
                # BUG FIX: format string used '%5' where '%s' was intended,
                # which corrupted the log line.
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get no chapter ', zid,
                    bookObj['id'], chapObj['cid'])
                continue
            # Drop junk titles so the list-level title survives the update().
            if u'.' == chapContentObj['chapter']['title'] or len(
                    chapContentObj['chapter']['title']) < 2:
                del chapContentObj['chapter']['title']
            chapObj.update(chapContentObj['chapter'])

            chapObj['content'] = chapObj['body']
            if chapObj.has_key('cpContent'):
                # Prefer the licensed (cp) content when present.
                chapObj['content'] = chapObj['cpContent']
                del chapObj['cpContent']
            chapObj['content'] = textClean(chapObj['content'])

            if len(chapObj['content']) < MinChapContentLength:
                myLogging.error('zid %s cid %s content too small skip', zid,
                                chapObj['cid'])
                continue

            # Normalize to our db schema.
            del chapObj['body']
            del chapObj['link']
            chapObj['rawUrl'] = chapContentUrl
            chapObj['size'] = len(chapObj['content'])
            chapObj['bookId'] = bookObj['id']
            chapObj['source'] = bookObj['source']
            chapObj['bookUUID'] = bookObj['digest']

            digest = getCapDigest(bookObj, chapObj, chapObj['cid'])
            chapObj['digest'] = digest

            capId = insertCapWithCapObj(chapObj)

            if not capId:
                continue
            uploadJson2Bucket(str(capId) + '.json', json.dumps(chapObj))

            resInx = max(resInx, idx)
        except Exception as e:
            # BUG FIX: format string used a bare '%' for zid instead of '%s',
            # which would raise inside the logger and mangle the message.
            myLogging.error('zid: %s, dbid: %s, idx: %s, get exception ', zid,
                            bookObj['id'], idx)
            myLogging.error(traceback.format_exc())
    return resInx
def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx = 1):
    """Crawl chapters 0..count of one mianfei-TXT book, insert each into the
    db and upload its JSON to the bucket; log per-phase average timings.

    :param allowUpdate: when True, chapter indexes already in the db are skipped
    :param bookObj: book record from our db (provides id/source/digest)
    :param count: highest chapter id to attempt
    :param mid: source-side book id, used for logging
    :param startCapIdx: initial value for the returned progress index
    :return: highest chapter index known persisted, or the lowest failed
        index when the API returned nothing and we bailed out early
    """
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in the db

    # myBookId = bookObj['id']
    #
    # startCap = time.time()
    # Per-phase timing accumulators, averaged and logged at the end.
    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:

            if allowUpdate:
                if cid in capIdxs:
                    continue  # chapter already in the db, skip it
                # else:
                #     startCap = time.time()

            befCrawl = time.time()
            succCapTimes = succCapTimes + 1

            # capContentUrl = MianFeiContentBaseUrl + str(cid) + '&contentid=' + str(mid)
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(bookObj['source']).mChapId(
                cid).mianfeiTXTSign().toUrl()

            # One retry on an empty response before attempting to parse.
            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                capContent = getContentWithUA(capContentUrl, ua)
            # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                # Re-parse strictly, then double-check code AND message.
                capListJsonObj = json.loads(capContent)
                if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                    resIdx = min(cid, resIdx)
                    myLogging.info('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                    return resIdx  # the upstream api lags behind; bail out so the later endless retry-forward pass can resume from resIdx

            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                return resIdx # the upstream api lags behind; bail out so the later endless retry-forward pass can resume from resIdx

            # Strip the inline style attribute, unwrap <body>, collapse
            # doubled newlines/breaks.
            # NOTE(review): indexing ['style'] raises KeyError when the attr
            # is absent — that lands in the except below; confirm intended.
            if contentSoup.body['style']:
                del contentSoup.body['style']
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '').replace(u'\n\n',
                                                                                                       u'\n').replace(
                u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # capObj['size'] = int(WordsCount)
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']

            digest = getCapDigest(bookObj, capObj, cid)

            capObj['digest'] = digest

            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)

            capId = insertCapWithCapObj(capObj)

            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)

            if not capId:
                continue
            # NOTE(review): assumes insertCapWithCapObj set capObj['id'] — confirm
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))

            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl' + str(mid) + ' cap ' + str(cid) + ' exception: ' + str(e))
            resIdx = min(cid, resIdx)
    if succCapTimes > 1:
        succCapTimes = succCapTimes - 1  # undo the initial 1 so averages divide by real attempts
    myLogging.info( 'crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) + \
        ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) + \
        ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx