Example #1
def updateByBookObj(bookObj):
    source = int(bookObj['source'].replace('shuqi', ''))
    newBookObj, digest = getBookObjFromSQid(source)
    if not newBookObj:
        # delBookById(bookObj['id'])
        myLogging.error(
            'shuqi book has been dropped, please consider deleting id: ' +
            str(bookObj['id']) + ' sid: ' + str(source))
        return
    if newBookObj['chapterNum'] > bookObj['chapterNum']:
        newBookObj['id'] = bookObj['id']
        newChapNum = crawlCapsWithBookObj(bookObj=newBookObj,
                                          bookId=source,
                                          allowUpdate=True)

        if newChapNum >= bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', newChapNum, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info(newBookObj['title'].encode('utf-8') + ' update ' +
                           str(newChapNum - bookObj['chapterNum']) + ' chaps ')

            if u'连载' != newBookObj['bookType']:
                updateOneFieldByOneField('bookType', newBookObj['bookType'],
                                         'id', bookObj['id'])
                myLogging.warning(newBookObj['title'].encode('utf-8') +
                                  newBookObj['bookType'].encode('utf-8'))
        else:
            myLogging.info(newBookObj['title'].encode('utf-8') +
                           ' has unexpected chapter count, please check; did not update ')
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') + ' no update')
def handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj):
    chapContentUrl = chapObj['url']
    chapContent = getContentWithUA(chapContentUrl)
    chapContentObj = json.loads(chapContent)
    if not chapContentObj or not chapContentObj['content'] or len(chapContentObj['content']) < MinChapContentLength:
        myLogging.error('bookId %s content too small, skip; chapContentUrl %s', bookObj['id'], chapContentUrl)
        return 0

    chapObj.update(chapContentObj)
    chapObj['title'] = chapObj['name']
    chapObj['rawUrl'] = chapContentUrl
    chapObj['idx'] = int(chapObj['serialNumber'])
    del chapObj['serialNumber']
    chapObj['size'] = len(chapObj['content'])
    chapObj['bookId'] = bookObj['id']
    chapObj['source'] = bookObj['source']
    chapObj['bookUUID'] = bookObj['digest']
    digest = getCapDigest(bookObj, chapObj, chapObj['bookChapterId'])
    chapObj['digest'] = digest
    chapObj['content'] = textClean(chapObj['content'])
    capId = insertCapWithCapObj(chapObj, allowUpdate=allowUpdate)
    # aftInsertCap = time.time()
    # insertCap = insertCap + (aftInsertCap - befInsertCap)
    if not capId:
        myLogging.error('no chapId cid %s', chapObj['bookChapterId'])
        return 0
    uploadJson2Bucket(str(chapObj['id']) + '.json', json.dumps(chapObj))

    return chapObj['idx']
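
Both chapter pipelines above dedupe on a digest before insert. getCapDigest itself is not shown in these examples; a minimal sketch of such a helper, assuming an md5 over the book's digest plus the upstream chapter id (names and scheme are illustrative, not the project's actual implementation):

import hashlib

def getCapDigestSketch(bookObj, chapObj, chapterId):
    # hypothetical stand-in for getCapDigest: a stable dedup key derived
    # from the book's digest and the upstream chapter id;
    # chapObj is kept only for signature parity with the call sites above
    raw = u'%s|%s' % (bookObj['digest'], chapterId)
    return hashlib.md5(raw.encode('utf-8')).hexdigest()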
Example #3
def fixNewLineByBookObjs(quanBenObjs):

    from parse.contentHelper import textClean
    for quanBenObj in quanBenObjs:
        bookId = quanBenObj['id']
        chapIds = getCapIdsByBookId(bookId)
        for chapId in chapIds:
            try:
                url = ossBaseUrl + str(chapId) + '.json'
                r = requests.get(url)

                obj = json.loads(r.text)

                if not obj or not obj.has_key('content'):
                    delCapById(chapId)
                    myLogging.info('chap id %s, has no oss obj, delete',
                                   chapId)
                    continue

                content = textClean(obj['content'])
                obj['content'] = content

                uploadJson2Bucket(str(chapId) + '.json', json.dumps(obj))
                myLogging.info('succ cid %s', chapId)
            except Exception as e:
                myLogging.error('chap id %s, with exception: %s', chapId,
                                traceback.format_exc())
def mianfeiUpdateByBookObj(bookObj, maxChapNum=0):
    mid = bookObj['source']
    newBookObj, newChapNum = crawlCurrentBookObj(mid)
    if not newBookObj:
        myLogging.error(
            'mid %s with dbId %s get None currentBookObj, plz check', mid,
            bookObj['id'])
        return
    latestCapIndex = newBookObj['latestCapIndex']
    newChapNum = max(newChapNum, latestCapIndex, maxChapNum)
    if newChapNum >= bookObj['chapterNum']:
        resIdx = handleCapsByBookObj(allowUpdate=True,
                                     bookObj=bookObj,
                                     count=newChapNum,
                                     mid=mid,
                                     startCapIdx=bookObj['chapterNum'])
        if resIdx > bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', resIdx, 'id', bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info(newBookObj['title'].encode('utf-8') + ' update ' +
                           str(resIdx - bookObj['chapterNum']) + ' chaps (mianTxt) ')
            if u'连载' != newBookObj['bookType']:
                updateOneFieldByOneField('bookType', newBookObj['bookType'],
                                         'id', bookObj['id'])
                myLogging.info(newBookObj['title'].encode('utf-8') +
                               newBookObj['bookType'].encode('utf-8'))
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') +
                       ' no update (mianTxt)')
Example #5
def updateFromMysql(st=10000, end=7000000):
    '''
        Runs forever: queries novels still in serialization (连载) status from the database and updates them.
    '''

    idx = st
    carry = 10000
    myLogging.info('start from %s to %s ', st, end)

    while idx < end:
        # seq = range(5000, 6000)
        seq = range(idx, idx + carry)

        random.shuffle(seq)
        #
        for sqBid in seq:
            # print sqBid
            # if sqBid in nullIdSet:
            #     continue
            if not srcIdBloom.contains('shuqi' + str(sqBid)):
                try:
                    num = start(sqBid, allowUpdate=False)
                    if num and num > 0:
                        srcIdBloom.add('shuqi' + str(sqBid))
                    # start(17043)
                except IOError:
                    # IOError is a subclass of Exception, so it must be handled first
                    myLogging.error('shuqi sid: %s , has IOError %s',
                                    str(sqBid), traceback.format_exc())
                except Exception as e:
                    myLogging.error('shuqi sid: %s , has exception %s',
                                    str(sqBid), traceback.format_exc())

        idx = idx + carry
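
The srcIdBloom gate above only relies on contains/add. A set-backed stand-in with the same interface (a real Bloom filter would trade exactness for memory; this class is illustrative, not the project's implementation):

class SetBackedBloom(object):
    # exact membership via a plain set; same contains/add contract as srcIdBloom
    def __init__(self):
        self._seen = set()

    def contains(self, key):
        return key in self._seen

    def add(self, key):
        self._seen.add(key)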
Example #6
def updateByBookObj(bookObj):
    source = bookObj['source']
    [zid, zBocId] = source.split('/')
    currentChapsObj = getChapsByBocId(zBocId)
    if not currentChapsObj or not currentChapsObj.has_key('chapters') or len(
            currentChapsObj['chapters']) < 1:
        # delBookById(bookObj['id'])
        myLogging.error(
            'zssq book may have been dropped, please consider deleting id: ' +
            str(bookObj['id']) + ' sid: ' + str(source))
        return
    currentChapNum = len(currentChapsObj['chapters'])
    if currentChapNum > bookObj['chapterNum']:

        newIdx = handlChapsByBookObjZidBocId(bookObj,
                                             zid,
                                             currentChapsObj,
                                             allowUpdate=True)

        if newIdx >= bookObj['chapterNum']:  # newIdx is 0-based, hence the +1 below
            updateOneFieldByOneField('chapterNum', newIdx + 1, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info('zid: %s, bookId: %s  update %s chaps ', zid,
                           bookObj['id'],
                           str(newIdx + 1 - bookObj['chapterNum']))

    else:
        myLogging.info('zid: %s, bookId: %s no update', zid, bookObj['id'])
def getSourceId(qid):
    srcUrl = srcListBaseUrl % str(qid)

    srcListContent = getContentWithUA(srcUrl)
    if not srcListContent:
        return
    srcJsonObj = json.loads(srcListContent)
    if not srcJsonObj or not srcJsonObj.has_key('items'):
        myLogging.error('no  srcObj items qid %s', qid)
        return

    srcItems = srcJsonObj['items']

    if len(srcItems.keys()) < 1:
        myLogging.error('  srcObj items len < 1 qid %s', qid)
        return

    if srcItems.has_key('api.zhuishuwang.com'):
        return srcItems['api.zhuishuwang.com'][0]['book_source_id']

    # updateTIme = 0
    # resId = ''
    # for itmkey in srcItems.keys():
    #     if srcItems[itmkey][0]['update_time'] > updateTIme:
    #         resId = srcItems[itmkey][0]['book_source_id']
    #         updateTIme = srcItems[itmkey][0]['update_time']
    #
    # return resId
    raise InputException('no zhuishuwang source, skip')
Example #8
def searchAndCrawl(searchInput, limit=5):

    searchResObj = search(searchInput)
    succcount = 0
    count = 0
    for bookObj in searchResObj['books']:
        count += 1
        if count > 5:  # only take the first N search results; skip the rest
            break

        digest = getBookDigest(bookObj)
        if bookDigestBloom.contains(digest):
            myLogging.info('has book %s, with same author %s, skip',
                           bookObj['title'].encode('utf-8'),
                           bookObj['author'].encode('utf-8'))
            continue
        zid = bookObj['_id']
        try:
            startByZid(zid, allowUpdate=False)
        except Exception as e:
            myLogging.error('zid %s has exception: %s', zid,
                            traceback.format_exc())
        succcount += 1
        if succcount > limit:  # cap on the number of books to crawl
            break
Example #9
def qichachaFromIndustry(f, t):
    myLogging.info('start from %s to %s ', f, t)
    indBaseUrl = 'http://www.qichacha.com/gongsi_industry?industryCode='
    conn, csor = getComConnCsor()
    for code in range(f, t + 1):
        industCode = chr(code + 65)
        industOrder = code
        inductBasePageUrl = indBaseUrl + industCode + '&industryorder=' + str(
            industOrder)

        try:
            myLogging.info('start indust base pages, %s', inductBasePageUrl)
            # qichachaFromIndustPageUrl(inductBasePageUrl,conn, csor)
            myLogging.info('end indust base pages, %s', inductBasePageUrl)

            myLogging.info('start indust subIndust pages, %s',
                           inductBasePageUrl)
            pageContent = getQichachaHtml(inductBasePageUrl)
            pageSoup = getSoupByStrEncode(pageContent, 'utf-8')
            subUrlTags = pageSoup.select('.filter-tag')[1]
            if not subUrlTags:
                myLogging.error('no subUrls, skipped, %s', inductBasePageUrl)
            for tag in subUrlTags.select('a'):
                subUri = tag['href']
                subUrl = urlparse.urljoin(indBaseUrl, subUri)

                myLogging.info('start sub indust base pages, %s', subUrl)
                qichachaFromIndustPageUrl(subUrl, conn, csor)
                myLogging.info('end sub indust base pages, %s', subUrl)
        except Exception as e:
            myLogging.error('indust error, industCode: %s url: %s; error: %s ',
                            industCode, inductBasePageUrl, e)
Example #10
def mianfeiTxtUpdateFromMysql():
    bookObjs = getMianAllBookObjs()
    for bookObj in bookObjs:
        try:
            mianfeiUpdateByBookObj(bookObj, maxChapNum=0)

        except Exception as e:
            myLogging.error('mianTxt update book ' + str(bookObj['id']) +
                            ' raise exception ')
            myLogging.error(traceback.format_exc())
Example #11
def qichachaFromIndustPageUrl(url, conn, csor):
    baseUrl = url.replace('?', '_').replace('&', '_').replace('=', '_') + '_p_'

    for pageCount in range(1, 501):
        pageUrl = baseUrl + str(pageCount) + '.shtml'

        try:
            pageContent = getQichachaHtml(pageUrl)
            pageSoup = getSoupByStrEncode(pageContent, 'utf-8')
            dealUIDsBySoup(conn, csor, pageCount, pageSoup, 'indust')
        except Exception as e:
            myLogging.error('page error, url: %s', pageUrl)
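
For reference, the replace() chain above maps a query URL onto qichacha's static paging scheme; a worked example with an assumed industry URL:

# assumed input, page 1:
url = 'http://www.qichacha.com/gongsi_industry?industryCode=A&industryorder=0'
pageUrl = url.replace('?', '_').replace('&', '_').replace('=', '_') + '_p_' + '1.shtml'
# -> http://www.qichacha.com/gongsi_industry_industryCode_A_industryorder_0_p_1.shtml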
Example #12
def updateFromMysql():
    '''
        Runs forever: queries novels still in serialization (连载) status from the database and updates them.
    '''

    bookObjs = getZssqAllLianZaiBookObjs()
    for bookObj in bookObjs:
        try:
            updateByBookObj(bookObj)
        except Exception as e:
            myLogging.error('update book' + str(bookObj['id']) +
                            ' raise exception ')
            myLogging.error(traceback.format_exc())
Example #13
def qichachaFromProvs(provs):
    myLogging.info('start: provs %s', str(provs))
    catBaseIrl = 'http://www.qichacha.com/gongsi_area_prov_'
    conn, csor = getComConnCsor()
    for prov in provs:
        pageBaseUrl = catBaseIrl + prov + '_p_'
        for pageCount in range(1, 501):
            pageUrl = pageBaseUrl + str(pageCount) + '.shtml'
            try:
                pageContent = getQichachaHtml(pageUrl)
                pageSoup = getSoupByStrEncode(pageContent, 'utf-8')
                dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov)
            except Exception as ee:
                myLogging.error('page ' + str(pageCount) + ' error %s', ee)
Example #14
def insertWithUid(conn2, csor2, prv, uid):

    if uid in idBloom:
        print 'already crawled uid:', uid
        return

    # idBloom.add(uid)

    global conn, csor
    if not conn or (not csor):
        conn2, csor2 = getComConnCsor()

    com_base_info_str = getBaseInfoById(prv, uid)
    com_base_info_json = json.loads(com_base_info_str)
    if com_base_info_json['status'] != 1:
        print 'json status not succ, uid:', uid, 'content:', com_base_info_str
        return
    data = com_base_info_json['data']['Company']
    companyType = data['EconKind']
    # webName = data['webName']
    companyName = data['Name']
    liscense = data['No']
    if not liscense:
        liscense = data['OrgNo']
    examineDate = ''
    if data['CheckDate']:
        examineDate = data['CheckDate'].strip()
        # webSite = ','.join(data['webSite'])
        # sql = """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""" % (str(id), companyName, companyType,examineDate, liscense, "tianyacha",webSite,webName)

    global staticInsertTotolCount, staticInsertTotolTime, staticInsertCarry
    startTime = time.time()

    try:
        csor2.execute(
            """insert ignore into com_base_copy (id,companyName,companyType,examineDate,liscense,source,src_content)
            values (%s,%s,%s,%s,%s,%s,%s);""",
            (uid, companyName, companyType, examineDate, liscense, "qichacha",
             com_base_info_str))
        conn2.commit()
        myLogging.info('comOk, uid: %s, comName: %s', uid,
                       unicode(companyName).encode('utf-8'))
        endTime = time.time()
        thisSpentTime = endTime - startTime

        statisMysqlInsert(staticInsertCarry, thisSpentTime)

    except Exception as e:
        myLogging.error('insert error, uid: %s, error:%s', uid, e)
def handleChapsByBookObj(bookObj, zid, allowUpdate=False):

    # zid = bookObj['source']

    bocObjs = getBocObjsByZid(zid)

    sourceCount = 0
    for bocIdx in range(0, len(bocObjs)):
        bocObj = bocObjs[bocIdx]
        bocId = bocObj['_id']

        try:

            bocSource = bocObj['source']
            if 'zhuishuvip' == bocSource:
                continue

            bookObj['source'] = zid + '/' + bocId
            bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(
                zid) + "?source=" + str(bocId)
            chapListObj = getChapsByBocId(bocId)
            bookObj['chapterNum'] = min(bookObj['chapterNum'],
                                        len(chapListObj['chapters']))

            if bookObj['chapterNum'] <= MINCHAPNUM:
                continue

            bookObj = insertBookWithConn(bookObj, allowUpdate)

            resInx = handlChapsByBookObjZidBocId(bookObj, zid, chapListObj,
                                                 allowUpdate)
            if resInx <= MINCHAPNUM:
                myLogging.info(
                    'zid %s dbid %s crawl too small chapNum, delete ', zid,
                    bookObj['id'])
                delBookById(bookObj['id'])

            sourceCount += 1
            if sourceCount >= sourceLimit:
                myLogging.info('zid: %s crawl source to sourceLimit', zid)
                break
            else:
                # bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(zid) + "?source=" + str(bocId)
                # bookObj = parseInsertBook(allowUpdate, bookObj, zid) #重新插入另外一个源的书
                myLogging.info('zid: %s crawl another source %s', zid, bocId)
        except Exception as e:
            myLogging.error('zid: %s ,bocId %s get exception ', zid, bocId)
            myLogging.error(traceback.format_exc())
Example #16
def insertCapWithCapObj(capObj, conn2=None, csor2=None, allowUpdate=False):
    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()

    # sql = "insert ignore cn_dushu_acticle (title,rawUrl,source,content,bookId,idx,digest,size,bookUUID) values" \
    #       "('%s','%s','%s','%s',%d,%d,'%s', %d, '%s')" % (
    #           capObj['title'], capObj['rawUrl'], capObj['source'], capObj['content']
    #           , capObj['bookId'], capObj['idx'], capObj['digest'], capObj['size'], capObj['bookUUID'])
    try:
        csor2.execute("insert cn_dushu_acticle (bookId,idx,digest,bookUUID,title,size) values" \
          "(%s,%s,%s,%s,%s,%s)" , (capObj['bookId'], capObj['idx'], capObj['digest'], capObj['bookUUID'], capObj['title'], capObj['size']))
        # csor2.execute("update cn_dushu_acticle set title = %s, size= %s where digest = %s" , (capObj['title'], capObj['size'], capObj['digest'] ))
        conn2.commit()
        myLogging.info('succ cap, idx: ' + str(capObj['idx']))
        # , ', content: ', capObj['content'][0:15]

    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error(ee)
        if not allowUpdate:
            return None
    try:
        csor2.execute(
            "select id,bookId from cn_dushu_acticle where digest = %s;",
            (capObj['digest'], ))
        conn2.commit()

        sqlObj = csor2.fetchone()
        capId = sqlObj[0]
        bookId = sqlObj[1]

        if bookId != capObj['bookId']:
            myLogging.info('update bookId' + str(capId))
            # row already exists with the wrong bookId: fix it so a stale chapter does not hold the slot
            csor2.execute(
                "update cn_dushu_acticle set bookId = %s where id = %s;",
                (capObj['bookId'], capId))
            conn2.commit()

        capObj['id'] = capId
        return capId
    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error(ee)
        return None

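
insertCapWithCapObj is effectively idempotent: a duplicate digest makes the INSERT fail, the error path falls through, and the SELECT returns the existing row's id. A hedged usage sketch (all field values made up):

chapObj = {
    'bookId': 123, 'idx': 0, 'digest': 'e3b0c44298fc1c14',
    'bookUUID': 'book-digest-here', 'title': u'Chapter 1', 'size': 2048,
}
capId = insertCapWithCapObj(chapObj, allowUpdate=True)
if capId:
    # chapObj['id'] is set as a side effect; safe to upload the JSON body now
    uploadJson2Bucket(str(capId) + '.json', json.dumps(chapObj))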
Example #17
def delCapById(cid):
    conn2, csor2 = getDushuConnCsor()

    try:
        csor2.execute("delete from " + db_acticle + " where id = %s", (cid, ))
        conn2.commit()
    except Exception as e:
        # roll back on error
        myLogging.error('mysql ex: ' + str(e))
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error('rollback error : ' + str(cid))

    csor2.close()
    conn2.close()
Example #18
def fromTopNBookIds():
    fbids = open('top5wBookIds.txt')
    bidsSet = set()
    while 1:
        bid = fbids.readline()
        if not bid:
            break
        bid = bid.replace('\n', '')
        bidsSet.add(bid)
    print 'loaded uniq book ids: ' + str(len(bidsSet))
    for bid in bidsSet:
        try:
            QuanBenCrawler(bid).crawl(allowUpdate=False)

        except Exception as e:
            myLogging.error(
                'bookId %s has exception: ' + traceback.format_exc(), bid)
Example #19
def insertBookWithConn(bookObj, allowUpdate=True, conn2=None, csor2=None):

    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()

    userId = random.randint(1, 50)

    updateTime = int(time.time())

    digest = getBookDigest(bookObj)
    bookObj['digest'] = digest

    # unified cleanup step
    bookObj['subtitle'] = subTitleClean(bookObj['subtitle'])

    if not bookObj.has_key('source'):
        bookObj['source'] = ''

    try:
        csor2.execute('insert  ' + db_dushu +
          '(categoryCode,typeCode,category,type,userId,title,subtitle,imgUrl,author,updateTime' \
          ",rawUrl,source,digest,status,viewNum, chapterNum, bookType, size) values" \
          "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s)" \
          , (bookObj['categoryCode'],bookObj['typeCode'], bookObj['category'], bookObj['type'], userId,bookObj['title']
             ,bookObj['subtitle'],bookObj['imgUrl'],bookObj['author'],updateTime, bookObj['rawUrl']
             ,bookObj['source'],digest, 11,bookObj['viewNum'],bookObj['chapterNum'],bookObj['bookType'],bookObj['size']))
        # csorDoc.execute('update cn_dushu_book set subtitle = %s where digest = %s'
        #   , (bookObj['subtitle'],digest))
        conn2.commit()
        myLogging.info('succ book, ' +
                       unicode(bookObj['title']).encode('utf-8'))
    except Exception as e:
        # roll back on error
        myLogging.warning('update rollback; maybe exists, err: %s',
                          traceback.format_exc())
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error('rollback error : ' + bookObj['rawUrl'])

        if u'完结' == bookObj['bookType']:
            updateBookTypeByRawUrl(bookObj['bookType'], bookObj['rawUrl'])
            # return None  # buggy, disabled
        if not allowUpdate:
            return None
def startByZid(zid, allowUpdate=False):

    bookObj = getBookObjBiZid(zid)
    if not bookObj:
        myLogging.error('zid %s get bookObj null', zid)
        return

    bookObj = parseBook(allowUpdate, bookObj, zid)

    # bocObjs = getBocObjsByZid(zid)

    # bookObj['source'] = zid + '/' + bocId
    # bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(zid)

    # bookObj = insertBookWithConn(bookObj, allowUpdate)
    if not bookObj:
        myLogging.error('zid %s parse and insert bookObj null', zid)
        return

    handleChapsByBookObj(bookObj, zid, allowUpdate)
Example #21
def delBookById(bookId):
    conn2, csor2 = getDushuConnCsor()

    # record deleted books in the per-source *_deleted_ids tables
    csor2.execute('select rawUrl,source from ' + db_dushu + ' where id = %s',
                  (bookId, ))
    conn2.commit()
    bookObj = csor2.fetchone()
    if not bookObj:
        return
    rawUrl = bookObj[0]
    source = bookObj[1].replace('shuqi', '')
    if 'shuqireader' in rawUrl:
        csor2.execute(
            'insert into shuqi_deleted_ids (id, sid) VALUEs (%s, %s)',
            (bookId, source))
        conn2.commit()
    elif 'yingyangcan' in rawUrl:
        csor2.execute(
            'insert into mianfei_deleted_ids (id, mid) VALUEs (%s, %s)',
            (bookId, source))
        conn2.commit()

    bookId = int(bookId)
    # sql = "delete from " + db_dushu + " where id = %d" % bookId
    try:
        csor2.execute("delete from " + db_dushu + " where id = %s", (bookId, ))
        conn2.commit()
    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error(ee)

    deleteChapsLargerThanIdx(bookId, -1)  # idx -1: delete all chapters

    csor2.close()
    conn2.close()
Example #22
def updateContentById(id, content):

    conn, csor = getDushuConnCsor()

    # sql = "update cn_dushu_acticle set content = %s where id = %s " % (content, str(id))
    try:
        csor.execute("update cn_dushu_acticle set content = %s where id = %s ",
                     (content, id))
        conn.commit()
        myLogging.info(str(id) + ' succ cap, ' + content[0:15])
    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn:
            try:
                conn.rollback()
            except Exception as ee:
                myLogging.error(ee)

    csor.close()
    conn.close()
Example #23
def searchAndCrawlByName(comName, proxy=None):
    if not comName:
        return None
    comName = comName.encode('utf-8')
    baseUrl = 'http://www.qichacha.com/search?key=' + quote(comName)
    # debugging leftovers:
    # baseUrl = 'http://www.qichacha.com/firm_CN_ea3a783f0c010fc31a2d75c2c9aa9b75'
    # baseUrl = 'http://www.qichacha.com/search?key=%E5%B0%8F%E7%B1%B3'
    ua = random.choice(USER_AGENTS)
    htmlContent = getQichachaHtml(baseUrl, noCookie=True)
    if not htmlContent:
        return None
    soup = getSoupByStrEncode(htmlContent)
    if not soup.select('ul.list-group a') or len(
            soup.select('ul.list-group a')) < 1:
        myLogging.debug(htmlContent)
        return None
    for uidTag in soup.select('ul.list-group a'):
        uid = uidTag['href'].replace('firm_', '')
        if uid == uidTag['href']:
            myLogging.warning('not uid, skip %s', uidTag['href'])
            continue

        uid = uid.replace('.shtml', '').replace('/', '')

        prv = None
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        # comName = uidTag.select_one('.text-lg').get_text()
        # comObj = dict()
        # comObj['uid'] = uid
        # comObj['comName'] = comName

        try:
            insertWithUid(conn, csor, prv, uid)
        except Exception as e:
            myLogging.error('insert with uid fail, uid: %s', uid)
        # print comLink
    return 'ok'
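
The href parsing above, repeated in dealUIDsBySoup below, strips the firm_ prefix, the .shtml suffix, and an optional province prefix. The same steps as a small helper (the example href is assumed):

def parseQichachaUid(href):
    # e.g. 'firm_js_ea3a783f0c01.shtml' -> ('js', 'ea3a783f0c01')
    if 'firm_' not in href:
        return None
    uid = href.replace('firm_', '').replace('.shtml', '').replace('/', '')
    prv = None
    if '_' in uid:
        prv, uid = uid.split('_', 1)
    return prv, uid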
Example #24
def dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov):
    uidList = pageSoup.select('.list-group-item')
    if len(uidList) < 1:
        myLogging.error('no com list, skip %s page: %s', prov, pageCount)
        return
        # continue
    for uidTag in uidList:
        try:
            if not uidTag.has_attr('href'):
                myLogging.error('no com Tag, skip %s page: %s; tag: %s', prov,
                                pageCount, uidTag)
                # continue
                return
            prv = None
            uid = uidTag['href'].replace('firm_',
                                         '').replace('.shtml',
                                                     '').replace('/', '')
            if '_' in uid:
                strs = uid.split('_')
                prv = strs[0]
                uid = strs[1]
            if uid in idBloom:
                myLogging.info('already crawled, skip uid: %s', uid)
                continue
            insertWithUid(conn, csor, prv, uid)
        except Exception as ee:
            myLogging.error('uid: %s error: %s', uid, ee)
Example #25
def getExistsCapsRawUrlId(bookId):

    conn, csor = getDushuConnCsor()

    checkCapsSql = 'select id,rawUrl from cn_dushu_acticle where bookId = %d' % (
        bookId)
    try:
        csor.execute(checkCapsSql)
        conn.commit()
        results = csor.fetchall()

        if not results or len(results) < 1:
            myLogging.warning('no caps, bookId:' + str(bookId))
            return None
        else:
            return results
    except Exception as e:
        # roll back on error
        myLogging.error(e)

    csor.close()
    conn.close()
Example #26
def updateFromMysql():
    '''
        Runs forever: queries novels still in serialization (连载) status from the database and updates them.
    '''

    # crawlInput = dict()
    # crawlInput['crawlerName'] = 'shuqiById'
    # data = dict()
    # crawlInput['data']  = data
    #
    # conn,csor = getDushuConnCsor()
    #
    # csor.execute("SELECT source from cn_dushu_book where operateStatus = 0  AND bookType = '连载' and rawUrl like 'http://api.shuqireader.com/reader/bc_cover.php%';")
    # conn.commit()
    #
    # ss = csor.fetchall()
    # for source in ss:
    #     sid = source[0].replace('shuqi', '')
    #     if '' == sid:
    #         continue
    #     data['sid'] = sid
    #
    #
    #     try:
    #         r = requests.post('http://0.0.0.0:10008/simpleCrawler', data = json.dumps(crawlInput))
    #     # print 'dine id: ',sid, 'with response: ',
    #     except Exception as e:
    #         print 'sid: ',sid, ' done with exception: ', e.message
    bookObjs = getShuqiAllLianZaiBookObjs()
    for bookObj in bookObjs:
        try:
            updateByBookObj(bookObj)
        except Exception as e:
            myLogging.error('update book' + str(bookObj['id']) +
                            ' raise exception ')
            myLogging.error(traceback.format_exc())
Example #27
def changeSouceIds():
    bookObjs = getMianAllBookBaseObjs()
    for bookObj in bookObjs:
        try:
            foundNewId = False
            title = bookObj['title']
            author = bookObj['author']
            source = bookObj['source']
            bookId = bookObj['id']

            searchUrl = MianFeiTXTSearchBaseUrl + '?' + paramMap().mianfeiTXT()\
                .put('keyword', (title + author).encode('utf-8'))\
                .put('pageSize', '10').put('pageNum', '1').put('type', '1')\
                .mianfeiTXTSign() \
                .toUrl()

            # time.sleep(random.)
            r = requests.get(searchUrl)

            searchRes = json.loads(r.text)
            for resBook in searchRes['data']['books']:
                resTitle = resBook['name']
                if resTitle != title:
                    continue
                resAuthor = resBook['author']
                if resAuthor != author:
                    continue

                resId = resBook['id']

                if str(resId) == str(source):
                    myLogging.info('source id unchanged, bookId: %s, orgSourceId: %s, newId: %s', bookId, source, resId)

                latestChapObj = getLatestChapByBookId(bookId)
                if not latestChapObj:
                    myLogging.error('no chaps in db yet, bookId: %s, new mid: %s', bookId, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break

                cid = latestChapObj['idx']
                chapTitle = latestChapObj['title']

                capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(resId).mChapId(
                    cid).mianfeiTXTSign().toUrl()

                capContent = getContentWithUA(capContentUrl)
                if not capContent:
                    capContent = getContentWithUA(capContentUrl)
                # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
                capListJsonObj = json.loads(capContent, strict=False)
                if not (capListJsonObj['returnCode'] == '0000'):
                    capListJsonObj = json.loads(capContent)
                    if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                        myLogging.error('get chap detail fail mid: %s, cid: %s', resId, cid)
                        continue

                chapterName = capListJsonObj['data']['bookChapter']['chapterName']
                if chapterName == chapTitle:
                    myLogging.info('bookId %s change source  from %s to %s', bookId, source, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
            if not foundNewId:
                myLogging.error('bookId %s did not find new id, title: %s, author: %s, org source: %s', bookId, title, author, source)
        except Exception as e:
            myLogging.error(traceback.format_exc())
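
changeSouceIds accepts a new upstream id only when title and author match exactly and, if chapters already exist locally, the latest chapter title agrees across sources. That acceptance test reduced to a hedged helper (fetchChapterName is an assumed callable, not a function from these examples):

def confirmSourceMatch(candidate, title, author, latestChapTitle, fetchChapterName):
    # exact metadata match first, then cross-check the newest chapter title
    if candidate['name'] != title or candidate['author'] != author:
        return False
    return fetchChapterName(candidate['id']) == latestChapTitle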
def handlChapsByBookObjZidBocId(bookObj, zid, chapListObj, allowUpdate=False):
    # chapListObj = getChapsByBocId(bocId)
    resInx = 0  # tracks the highest chapter index successfully written
    # chapListObj = getChapObjs(bookObj)
    if not chapListObj:
        myLogging.error('zid %s get chaps list null', zid)
        return resInx
    if not chapListObj.has_key('chapters'):
        myLogging.error('zid %s chaps list no data', zid)
        return resInx
    capIdxs = set()
    capTitles = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in the DB
        capTitles = getChapTitlesByBookId(bookObj['id'])  # chapter titles already in the DB
    for idx in range(0, len(chapListObj['chapters'])):
        try:
            # if idx in capIdxs:
            #     continue

            chapObj = chapListObj['chapters'][idx]

            if chapObj['title'] in capTitles:
                continue
            if idx in capIdxs:
                continue
            chapObj['cid'] = chapObj['link']
            if chapObj.has_key('id'):
                chapObj['cid'] = chapObj['id']
            chapObj['idx'] = idx

            chapContentUrl = ZSSQCHAPCONTENTBASEURL + quote(chapObj['link'])
            chapContentText = getContentWithUA(chapContentUrl)
            if not chapContentText:
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get chapContent null ',
                    zid, bookObj['id'], chapObj['cid'])
                continue
            chapContentObj = json.loads(chapContentText)
            if not chapContentObj or not chapContentObj.has_key('chapter'):
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get no chapter ', zid,
                    bookObj['id'], chapObj['cid'])
                continue
            if u'.' == chapContentObj['chapter']['title'] or len(
                    chapContentObj['chapter']['title']) < 2:
                del chapContentObj['chapter']['title']
            chapObj.update(chapContentObj['chapter'])

            chapObj['content'] = chapObj['body']
            if chapObj.has_key('cpContent'):
                chapObj['content'] = chapObj['cpContent']
                del chapObj['cpContent']
            chapObj['content'] = textClean(chapObj['content'])

            if len(chapObj['content']) < MinChapContentLength:
                myLogging.error('zid %s cid %s content too small skip', zid,
                                chapObj['cid'])
                continue

            del chapObj['body']
            del chapObj['link']
            chapObj['rawUrl'] = chapContentUrl
            # capObj['size'] = int(WordsCount)
            chapObj['size'] = len(chapObj['content'])
            chapObj['bookId'] = bookObj['id']
            chapObj['source'] = bookObj['source']
            chapObj['bookUUID'] = bookObj['digest']

            digest = getCapDigest(bookObj, chapObj, chapObj['cid'])
            chapObj['digest'] = digest

            capId = insertCapWithCapObj(chapObj)

            # aftInsertCap = time.time()
            # insertCap = insertCap + (aftInsertCap - befInsertCap)

            if not capId:
                continue
            uploadJson2Bucket(str(capId) + '.json', json.dumps(chapObj))

            resInx = max(resInx, idx)
            # aftUploadCap = time.time()
            # uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
        except Exception as e:
            myLogging.error('zid: %s, dbid: %s, idx: %s, get exception ', zid,
                            bookObj['id'], idx)
            myLogging.error(traceback.format_exc())
    return resInx
def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx=1):
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in the DB

    # myBookId = bookObj['id']
    #
    # startCap = time.time()
    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:

            if allowUpdate:
                if cid in capIdxs:
                    continue  # chapter already in the DB, skip
                # else:
                #     startCap = time.time()

            befCrawl = time.time()
            succCapTimes = succCapTimes + 1

            # capContentUrl = MianFeiContentBaseUrl + str(cid) + '&contentid=' + str(mid)
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(bookObj['source']).mChapId(
                cid).mianfeiTXTSign().toUrl()

            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                capContent = getContentWithUA(capContentUrl, ua)
            # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                capListJsonObj = json.loads(capContent)
                if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                    resIdx = min(cid, resIdx)
                    myLogging.info('chap content null, RETURN, capId: ' + str(cid) + ' mid: ' + str(mid))
                    return resIdx  # the upstream API lags behind; bail out here so the retry-forward pass can take over

            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null, RETURN, capId: ' + str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                return resIdx  # same early exit: the upstream API lags behind the newest chapters

            if contentSoup.body['style']:
                del contentSoup.body['style']
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '') \
                .replace(u'\n\n', u'\n').replace(u'<br><br>', u'<br>').replace(u'<br/><br/>', u'<br/>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # capObj['size'] = int(WordsCount)
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']

            digest = getCapDigest(bookObj, capObj, cid)

            capObj['digest'] = digest

            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)

            capId = insertCapWithCapObj(capObj)

            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)

            if not capId:
                continue
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))

            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl' + str(mid) + ' cap ' + str(cid) + ' exception: ' + str(e))
            resIdx = min(cid, resIdx)
    if succCapTimes > 1:
        succCapTimes = succCapTimes - 1
    myLogging.info( 'crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) + \
        ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) + \
        ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''

@author: zyq
'''
import time
import traceback

from app.SearchHistoryCrawler import crawlByDailySearchHistory
from app.shuqiUpdater import updateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging
# import logging
# logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',level=logging.INFO)
if __name__ == '__main__':
    timeStart = int(time.time() * 1000) - 24 * 3600 * 1000
    while 1:
        myLogging.info('begin searchHistoryCrawler')
        timeBeforeSearch = int(time.time() * 1000)
        try:
            crawlByDailySearchHistory(timeStart)
        except Exception as e:
            myLogging.error(traceback.format_exc())
        timeStart = timeBeforeSearch
        sleepTime = getHotConfigDict()['searchHistoryCrawler']['updateSleep']
        myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
        time.sleep(int(sleepTime))