def updateByBookObj(bookObj):
    # Refresh one shuqi-sourced book: re-crawl its upstream record and pull
    # in any chapters published since our stored copy.
    # bookObj: DB row dict; 'source' is the string 'shuqi<sid>'.
    source = int(bookObj['source'].replace('shuqi', ''))
    newBookObj, digest = getBookObjFromSQid(source)
    if not newBookObj:
        # Book vanished upstream; deletion is left as a manual decision.
        # delBookById(bookObj['id'])
        myLogging.error(
            'shuqi book has been droped, plz consider to delete id: ' +
            str(bookObj['id']) + ' sid: ' + str(source))
        return
    if newBookObj['chapterNum'] > bookObj['chapterNum']:
        # Upstream has more chapters than we stored -> crawl the delta.
        newBookObj['id'] = bookObj['id']
        newChapNum = crawlCapsWithBookObj(bookObj=newBookObj, bookId=source,
                                          allowUpdate=True)
        if newChapNum >= bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', newChapNum, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info(
                newBookObj['title'].encode('utf-8') + ' update ' +
                str(newChapNum - bookObj['chapterNum'])\
                + ' chaps ')
            if u'连载' != newBookObj['bookType']:
                # Serial status changed upstream (e.g. now finished);
                # persist the new status.
                updateOneFieldByOneField('bookType', newBookObj['bookType'],
                                         'id', bookObj['id'])
                myLogging.warning(newBookObj['title'].encode('utf-8') +
                                  newBookObj['bookType'].encode('utf-8'))
        else:
            # Crawl came back with fewer chapters than we already have.
            myLogging.info(newBookObj['title'].encode('utf-8') +
                           ' has unexcepted, please check. didnot update ')
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') + ' no update ()')
def handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj):
    # Fetch, normalize and persist a single chapter of a book.
    # Returns the chapter index (1-based serialNumber) on success, 0 on
    # any failure (bad content, insert failure).
    chapContentUrl = chapObj['url']
    chapContent = getContentWithUA(chapContentUrl)
    chapContentObj = json.loads(chapContent)
    if not chapContentObj or not chapContentObj['content'] or len(
            chapContentObj['content']) < MinChapContentLength:
        # Empty or too-short body is treated as broken and skipped.
        myLogging.error('zid %s content too small skip, chapContentUrl %s',
                        bookObj['id'], chapContentUrl)
        return 0
    chapObj.update(chapContentObj)
    # Map upstream fields onto our chapter schema.
    chapObj['title'] = chapObj['name']
    chapObj['rawUrl'] = chapContentUrl
    chapObj['idx'] = int(chapObj['serialNumber'])
    del chapObj['serialNumber']
    chapObj['size'] = len(chapObj['content'])
    chapObj['bookId'] = bookObj['id']
    chapObj['source'] = bookObj['source']
    chapObj['bookUUID'] = bookObj['digest']
    digest = getCapDigest(bookObj, chapObj, chapObj['bookChapterId'])
    chapObj['digest'] = digest
    chapObj['content'] = textClean(chapObj['content'])
    capId = insertCapWithCapObj(chapObj, allowUpdate=allowUpdate)
    # aftInsertCap = time.time()
    # insertCap = insertCap + (aftInsertCap - befInsertCap)
    if not capId:
        myLogging.error('no chapId cid %s', chapObj['bookChapterId'])
        return 0
    # Chapter body lives in object storage, keyed by its DB id.
    uploadJson2Bucket(str(chapObj['id']) + '.json', json.dumps(chapObj))
    return chapObj['idx']
def fixNewLineByBookObjs(quanBenObjs):
    """Re-run textClean over every stored chapter of the given books.

    Chapters whose OSS JSON lacks a 'content' key are deleted from the DB;
    cleaned chapters are re-uploaded to the bucket in place.
    """
    from parse.contentHelper import textClean
    for quanBenObj in quanBenObjs:
        for chapId in getCapIdsByBookId(quanBenObj['id']):
            try:
                resp = requests.get(ossBaseUrl + str(chapId) + '.json')
                chapJson = json.loads(resp.text)
                if not chapJson or not chapJson.has_key('content'):
                    delCapById(chapId)
                    myLogging.info('chap id %s, has no oss obj, delete',
                                   chapId)
                    continue
                chapJson['content'] = textClean(chapJson['content'])
                uploadJson2Bucket(str(chapId) + '.json', json.dumps(chapJson))
                myLogging.info('succ cid %s', chapId)
            except Exception as e:
                myLogging.error('chap id %s, with exception: %s', chapId,
                                traceback.format_exc())
def mianfeiUpdateByBookObj(bookObj, maxChapNum=0):
    # Refresh one mianfeiTXT book: compare the live chapter count against
    # our stored count and crawl only the missing tail.
    # maxChapNum: optional externally-known chapter count floor.
    mid = bookObj['source']
    newBookObj, newChapNum = crawlCurrentBookObj(mid)
    if not newBookObj:
        myLogging.error(
            'mid %s with dbId %s get None currentBookObj, plz check', mid,
            bookObj['id'])
        return
    latestCapIndex = newBookObj['latestCapIndex']
    # The real upstream count is the max of the three signals we have.
    newChapNum = max(newChapNum, latestCapIndex, maxChapNum)
    if newChapNum >= bookObj['chapterNum']:
        resIdx = handleCapsByBookObj(allowUpdate=True, bookObj=bookObj,
                                     count=newChapNum, mid=mid,
                                     startCapIdx=bookObj['chapterNum'])
        if resIdx > bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', resIdx, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info(
                newBookObj['title'].encode('utf-8') + ' update ' \
                + str(resIdx - bookObj['chapterNum']) + ' chaps (mianTxt) ')
            if u'连载' != newBookObj['bookType']:
                # Serial status changed upstream; persist it.
                updateOneFieldByOneField('bookType', newBookObj['bookType'],
                                         'id', bookObj['id'])
                myLogging.info(newBookObj['title'].encode('utf-8') +
                               newBookObj['bookType'])
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') +
                       ' no update (mianTxt)')
def updateFromMysql(st=10000, end=7000000):
    '''
    Scan shuqi source ids in [st, end) in shuffled batches of 10k and crawl
    every id not yet recorded in srcIdBloom. Runs until the range is done.
    '''
    idx = st
    carry = 10000
    myLogging.info('start from %s to %s ', st, end)
    while idx < end:
        seq = range(idx, idx + carry)
        random.shuffle(seq)  # randomize crawl order inside the batch
        for sqBid in seq:
            if not srcIdBloom.contains('shuqi' + str(sqBid)):
                try:
                    num = start(sqBid, allowUpdate=False)
                    if num and num > 0:
                        # Only remember ids that actually yielded chapters.
                        srcIdBloom.add('shuqi' + str(sqBid))
                except Exception as e:
                    # Bug fix: a second `except IOError` handler here was
                    # unreachable (IOError subclasses Exception) and did the
                    # same logging, so it was removed.
                    myLogging.error('shuqi sid: %s , has exception %s',
                                    str(sqBid), traceback.format_exc())
        idx = idx + carry
def updateByBookObj(bookObj):
    """Refresh one zhuishushenqi book, crawling chapters beyond our count."""
    source = bookObj['source']
    [zid, zBocId] = source.split('/')
    currentChapsObj = getChapsByBocId(zBocId)
    hasChapters = (currentChapsObj and currentChapsObj.has_key('chapters')
                   and len(currentChapsObj['chapters']) >= 1)
    if not hasChapters:
        # delBookById(bookObj['id'])
        myLogging.error(
            'zssq book maybe have been droped, plz consider to delete id: ' +
            str(bookObj['id']) + ' sid: ' + str(source))
        return
    currentChapNum = len(currentChapsObj['chapters'])
    if currentChapNum > bookObj['chapterNum']:
        newIdx = handlChapsByBookObjZidBocId(bookObj, zid, currentChapsObj,
                                             allowUpdate=True)
        if newIdx >= bookObj['chapterNum']:
            # newIdx is 0-based while chapterNum is a count, hence the +1.
            updateOneFieldByOneField('chapterNum', newIdx + 1, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info('zid: %s, bookId: %s update %s chaps ', zid,
                           bookObj['id'],
                           str(newIdx + 1 - bookObj['chapterNum']))
    else:
        myLogging.info('zid: %s, bookId: %s no update ()', zid, bookObj['id'])
def getSourceId(qid):
    """Return zhuishuwang's book_source_id for `qid`.

    Returns None when the source list cannot be fetched or is empty;
    raises InputException when no zhuishuwang entry exists.
    """
    srcListContent = getContentWithUA(srcListBaseUrl % str(qid))
    if not srcListContent:
        return
    srcJsonObj = json.loads(srcListContent)
    if not srcJsonObj or not srcJsonObj.has_key('items'):
        myLogging.error('no srcObj items qid %s', qid)
        return
    srcItems = srcJsonObj['items']
    if len(srcItems.keys()) < 1:
        myLogging.error(' srcObj items len < 1 qid %s', qid)
        return
    if srcItems.has_key('api.zhuishuwang.com'):
        return srcItems['api.zhuishuwang.com'][0]['book_source_id']
    raise InputException('no zhuishuwang source, skip')
def searchAndCrawl(searchInput, limit=5, topN=5):
    """Search for `searchInput` and crawl up to `limit` previously-unseen books.

    Only the first `topN` search results are inspected (the old behavior
    hard-coded 5; the default preserves it). Books whose digest is already
    in bookDigestBloom are skipped.
    """
    searchResObj = search(searchInput)
    succcount = 0
    count = 0
    for bookObj in searchResObj['books']:
        count += 1
        if count > topN:  # only inspect the first N search hits
            break
        digest = getBookDigest(bookObj)
        if bookDigestBloom.contains(digest):
            myLogging.info('has book %s, with same author %s, skip',
                           bookObj['title'].encode('utf-8'),
                           bookObj['author'].encode('utf-8'))
            continue
        zid = bookObj['_id']
        try:
            startByZid(zid, allowUpdate=False)
            # Bug fix: only count a book once startByZid completed without
            # raising; previously failed crawls were counted toward `limit`.
            succcount += 1
        except Exception as e:
            myLogging.error('zid %s has exception: %s', zid,
                            traceback.format_exc())
        if succcount > limit:  # crawled enough new books
            break
def qichachaFromIndustry(f, t):
    """Crawl qichacha company listings for industry codes in [f, t].

    Numeric codes map to industry letters ('A' + code). For each industry
    page, every sub-industry link found under the second .filter-tag block
    is crawled page by page.
    """
    myLogging.info('start from %s to %s ', f, t)
    indBaseUrl = 'http://www.qichacha.com/gongsi_industry?industryCode='
    conn, csor = getComConnCsor()
    for code in range(f, t + 1):
        industCode = chr(code + 65)  # 0 -> 'A', 1 -> 'B', ...
        industOrder = code
        inductBasePageUrl = indBaseUrl + industCode + '&industryorder=' + str(
            industOrder)
        try:
            myLogging.info('start indust base pages, %s', inductBasePageUrl)
            # qichachaFromIndustPageUrl(inductBasePageUrl,conn, csor)
            myLogging.info('end indust base pages, %s', inductBasePageUrl)
            myLogging.info('start indust subIndust pages, %s',
                           inductBasePageUrl)
            pageContent = getQichachaHtml(inductBasePageUrl)
            pageSoup = getSoupByStrEncode(pageContent, 'utf-8')
            subUrlTags = pageSoup.select('.filter-tag')[1]
            if not subUrlTags:
                myLogging.error('no subUrls, skipped, %s', inductBasePageUrl)
                # Bug fix: previously fell through and iterated the empty
                # tag anyway despite logging "skipped".
                continue
            for tag in subUrlTags.select('a'):
                subUri = tag['href']
                subUrl = urlparse.urljoin(indBaseUrl, subUri)
                myLogging.info('start sub indust base pages, %s', subUrl)
                qichachaFromIndustPageUrl(subUrl, conn, csor)
                myLogging.info('end sub indust base pages, %s', subUrl)
        except Exception as e:
            myLogging.error('indust error, industCode: %s url: %s; error: %s ',
                            industCode, inductBasePageUrl, e)
def mianfeiTxtUpdateFromMysql():
    """Refresh every mianfei book from the DB, logging per-book failures."""
    for bookObj in getMianAllBookObjs():
        try:
            mianfeiUpdateByBookObj(bookObj, maxChapNum=0)
        except Exception as e:
            myLogging.error('mianTxt update book ' + str(bookObj['id']) +
                            ' raise exception ')
            myLogging.error(traceback.format_exc())
def qichachaFromIndustPageUrl(url, conn, csor):
    """Walk pages 1..500 of an industry listing URL and process each."""
    baseUrl = url.replace('?', '_').replace('&', '_').replace('=', '_') + '_p_'
    for pageCount in range(1, 501):
        pageUrl = baseUrl + str(pageCount) + '.shtml'
        try:
            soup = getSoupByStrEncode(getQichachaHtml(pageUrl), 'utf-8')
            dealUIDsBySoup(conn, csor, pageCount, soup, 'indust')
        except Exception as e:
            myLogging.error('page error, url: %s', pageUrl)
def updateFromMysql():
    '''
    Refresh every zhuishushenqi book still marked as serializing,
    logging any per-book failure and moving on.
    '''
    for bookObj in getZssqAllLianZaiBookObjs():
        try:
            updateByBookObj(bookObj)
        except Exception as e:
            myLogging.error('update book' + str(bookObj['id']) +
                            ' raise exception ')
            myLogging.error(traceback.format_exc())
def qichachaFromProvs(provs):
    """Crawl qichacha per-province company listings, 500 pages each."""
    myLogging.info('start: provs %s', str(provs))
    catBaseIrl = 'http://www.qichacha.com/gongsi_area_prov_'
    conn, csor = getComConnCsor()
    for prov in provs:
        for pageCount in range(1, 501):
            pageUrl = catBaseIrl + prov + '_p_' + str(pageCount) + '.shtml'
            try:
                pageSoup = getSoupByStrEncode(getQichachaHtml(pageUrl),
                                              'utf-8')
                dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov)
            except Exception as ee:
                myLogging.error('page ' + str(pageCount) + ' error %s', ee)
def insertWithUid(conn2, csor2, prv, uid): if uid in idBloom: print 'already crawled uid:', uid return # idBloom.add(uid) global conn, csor if not conn or (not csor): conn2, csor2 = getComConnCsor() com_base_info_str = getBaseInfoById(prv, uid) com_base_info_json = json.loads(com_base_info_str) if com_base_info_json['status'] != 1: print 'json int not succ , uid: ', uid, ' content:', com_base_info_str return data = com_base_info_json['data']['Company'] companyType = data['EconKind'] # webName = data['webName'] companyName = data['Name'] liscense = data['No'] if not liscense: liscense = data['OrgNo'] examineDate = '' if data['CheckDate']: examineDate = data['CheckDate'].strip() # webSite = ','.join(data['webSite']) # sql = """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""" % (str(id), companyName, companyType,examineDate, liscense, "tianyacha",webSite,webName) global staticInsertTotolCount, staticInsertTotolTime, staticInsertCarry startTime = time.time() try: csor2.execute( """insert ignore into com_base_copy (id,companyName,companyType,examineDate,liscense,source,src_content) values (%s,%s,%s,%s,%s,%s,%s);""", (uid, companyName, companyType, examineDate, liscense, "qichacha", com_base_info_str)) conn2.commit() myLogging.info('comOk, uid: %s, comName: %s', uid, unicode(companyName).encode('utf-8')) endTime = time.time() thisSpentTime = endTime - startTime statisMysqlInsert(staticInsertCarry, thisSpentTime) except Exception as e: myLogging.error('insert error, uid: %s, error:%s', uid, e)
def handleChapsByBookObj(bookObj, zid, allowUpdate=False):
    # Crawl the chapters of a zhuishushenqi book, trying its alternative
    # sources (boc objects) until sourceLimit sources have been attempted.
    # NOTE(review): sourceCount seems to count every attempted source, and
    # books that crawl too few chapters are deleted again — confirm against
    # the original (un-collapsed) file's indentation.
    # zid = bookObj['source']
    bocObjs = getBocObjsByZid(zid)
    sourceCount = 0
    for bocIdx in range(0, len(bocObjs)):
        bocObj = bocObjs[bocIdx]
        bocId = bocObj['_id']
        try:
            bocSource = bocObj['source']
            if 'zhuishuvip' == bocSource:
                # paid source: skip
                continue
            bookObj['source'] = zid + '/' + bocId
            bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(
                zid) + "?source=" + str(bocId)
            chapListObj = getChapsByBocId(bocId)
            # Never claim more chapters than this source actually has.
            bookObj['chapterNum'] = min(bookObj['chapterNum'],
                                        len(chapListObj['chapters']))
            if bookObj['chapterNum'] <= MINCHAPNUM:
                continue
            bookObj = insertBookWithConn(bookObj, allowUpdate)
            resInx = handlChapsByBookObjZidBocId(bookObj, zid, chapListObj,
                                                 allowUpdate)
            if resInx <= MINCHAPNUM:
                # Too few chapters actually stored: drop the book row again.
                myLogging.info(
                    'zid %s dbid %s crawl too small chapNum, delete ', zid,
                    bookObj['id'])
                delBookById(bookObj['id'])
            sourceCount += 1
            if sourceCount >= sourceLimit:
                myLogging.info('zid: %s crawl source to sourceLimit', zid)
                break
            else:
                # bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(zid) + "?source=" + str(bocId)
                # bookObj = parseInsertBook(allowUpdate, bookObj, zid)
                # re-insert the book under the next alternative source
                myLogging.info('zid: %s crawl another source %s', zid, bocId)
        except Exception as e:
            myLogging.error('zid: %s ,bocId %s get exception ', zid, bocId)
            myLogging.error(traceback.format_exc())
def insertCapWithCapObj(capObj, conn2=None, csor2=None, allowUpdate=False):
    """Insert a chapter row; on a duplicate digest optionally re-use/repair it.

    Returns the chapter's DB id (also stored into capObj['id']) or None on
    failure. When the digest already exists and allowUpdate is true, the
    existing row's id is returned and its bookId corrected if it differs.
    """
    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()
    try:
        csor2.execute(
            "insert cn_dushu_acticle (bookId,idx,digest,bookUUID,title,size) values"
            "(%s,%s,%s,%s,%s,%s)",
            (capObj['bookId'], capObj['idx'], capObj['digest'],
             capObj['bookUUID'], capObj['title'], capObj['size']))
        conn2.commit()
        myLogging.info('scap, ' + ":" + str(capObj['idx']))
        # Bug fix: the success path previously fell through and returned
        # None, so callers (which treat a falsy result as failure) never
        # uploaded freshly inserted chapters. Return the new row id.
        capId = csor2.lastrowid
        capObj['id'] = capId
        csor2.close()
        conn2.close()
        return capId
    except Exception as e:
        # Insert failed (most commonly: digest already exists).
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error(ee)
        if not allowUpdate:
            return None
        try:
            csor2.execute(
                "select id,bookId from cn_dushu_acticle where digest = %s;",
                (capObj['digest'], ))
            conn2.commit()
            sqlObj = csor2.fetchone()
            capId = sqlObj[0]
            bookId = sqlObj[1]
            if bookId != capObj['bookId']:
                myLogging.info('update bookId' + str(capId))
                # Existing row has the wrong bookId: repair it so a stale
                # chapter cannot occupy this digest's slot.
                csor2.execute(
                    "update cn_dushu_acticle set bookId = %s where id = %s;",
                    (capObj['bookId'], capId))
                conn2.commit()
            capObj['id'] = capId
            return capId
        except Exception as e:
            myLogging.error(e)
            if conn2:
                try:
                    conn2.rollback()
                except Exception as ee:
                    myLogging.error(ee)
            return None
def delCapById(cid):
    """Delete the chapter row with id `cid`, rolling back on DB failure."""
    dbConn, dbCsor = getDushuConnCsor()
    try:
        dbCsor.execute("delete from " + db_acticle + " where id = %s",
                       (cid, ))
        dbConn.commit()
    except Exception as e:
        # roll back on any DB error
        myLogging.error('mysql ex: ' + str(e))
        if dbConn:
            try:
                dbConn.rollback()
            except Exception as ee:
                myLogging.error('rollback error : ' + str(cid))
    dbCsor.close()
    dbConn.close()
def fromTopNBookIds(): fbids = open('top5wBookIds.txt') bidsSet = set() while 1: bid = fbids.readline() if not bid: break bid = bid.replace('\n', '') bidsSet.add(bid) print 'load to dict uniq ' + str(len(bidsSet)) for bid in bidsSet: try: QuanBenCrawler(bid).crawl(allowUpdate=False) except Exception as e: myLogging.error( 'bookId %s has exception: ' + traceback.format_exc(), bid)
def insertBookWithConn(bookObj, allowUpdate=True, conn2=None, csor2=None):
    # Insert a book row (status 11, random userId 1-50). On failure (most
    # commonly a duplicate digest) roll back; if the book is now finished,
    # persist the status change on the existing row.
    # NOTE(review): as visible here the function never returns bookObj on
    # success, yet callers assign its result — the original file may
    # continue past this chunk; verify before relying on the return value.
    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()
    userId = random.randint(1, 50)
    updateTime = int(time.time())
    digest = getBookDigest(bookObj)
    bookObj['digest'] = digest
    # unified clean-up before storing
    bookObj['subtitle'] = subTitleClean(bookObj['subtitle'])
    if not bookObj.has_key('source'):
        bookObj['source'] = ''
    try:
        csor2.execute('insert ' + db_dushu +
            '(categoryCode,typeCode,category,type,userId,title,subtitle,imgUrl,author,updateTime' \
            ",rawUrl,source,digest,status,viewNum, chapterNum, bookType, size) values" \
            "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s)" \
            , (bookObj['categoryCode'], bookObj['typeCode'],
               bookObj['category'], bookObj['type'], userId, bookObj['title'],
               bookObj['subtitle'], bookObj['imgUrl'], bookObj['author'],
               updateTime, bookObj['rawUrl'], bookObj['source'], digest, 11,
               bookObj['viewNum'], bookObj['chapterNum'], bookObj['bookType'],
               bookObj['size']))
        # csorDoc.execute('update cn_dushu_book set subtitle = %s where digest = %s'
        #                 , (bookObj['subtitle'],digest))
        conn2.commit()
        myLogging.info('succ book, ' +
                       unicode(bookObj['title']).encode('utf-8'))
    except Exception, e:  # Python 2 except syntax, kept as-is
        # Most likely the row already exists (duplicate digest).
        myLogging.warning('update rollback; maybe exists, err: %s',
                          traceback.format_exc())
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error('rollback error : ' + bookObj['rawUrl'])
        if u'完结' == bookObj['bookType']:
            # Book is now finished: persist the status even though the
            # insert itself failed (the row already exists).
            updateBookTypeByRawUrl(bookObj['bookType'], bookObj['rawUrl'])
        # return None  # had a bug
        if not allowUpdate:
            return None
def startByZid(zid, allowUpdate=False):
    """Crawl one zhuishushenqi book end-to-end: book info, then chapters."""
    bookObj = getBookObjBiZid(zid)
    if not bookObj:
        myLogging.error('zid %s get bookObj null', zid)
        return
    bookObj = parseBook(allowUpdate, bookObj, zid)
    if not bookObj:
        myLogging.error('zid %s parse and insert bookObj null', zid)
        return
    handleChapsByBookObj(bookObj, zid, allowUpdate)
def delBookById(bookId):
    """Delete a book row plus all of its chapters.

    Before deleting, the (id, source-id) pair is archived into the
    shuqi_/mianfei_deleted_ids table matching the book's rawUrl origin.
    """
    conn2, csor2 = getDushuConnCsor()
    # Record the deleted book into the deleted_ids bookkeeping tables.
    csor2.execute('select rawUrl,source from ' + db_dushu + ' where id = %s',
                  (bookId, ))
    conn2.commit()
    bookObj = csor2.fetchone()
    if not bookObj:
        return
    rawUrl = bookObj[0]
    source = bookObj[1].replace('shuqi', '')
    if 'shuqireader' in rawUrl:
        csor2.execute(
            'insert into shuqi_deleted_ids (id, sid) VALUEs (%s, %s)',
            (bookId, source))
        conn2.commit()
    elif 'yingyangcan' in rawUrl:
        csor2.execute(
            'insert into mianfei_deleted_ids (id, mid) VALUEs (%s, %s)',
            (bookId, source))
        conn2.commit()
    bookId = int(bookId)
    try:
        csor2.execute("delete from " + db_dushu + " where id = %s",
                      (bookId, ))
        conn2.commit()
    except Exception as e:
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                # Bug fix: log the rollback failure itself (was logging the
                # outer exception `e` instead of `ee`).
                myLogging.error(ee)
    deleteChapsLargerThanIdx(bookId, -1)  # idx > -1 == every chapter
    csor2.close()
    conn2.close()
def updateContentById(id, content):
    """Overwrite the content column of chapter row `id`."""
    dbConn, dbCsor = getDushuConnCsor()
    try:
        dbCsor.execute(
            "update cn_dushu_acticle set content = %s where id = %s ",
            (content, id))
        dbConn.commit()
        myLogging.info(str(id) + ' succ cap, ' + content[0:15])
    except Exception as e:
        # roll back on any DB error
        myLogging.error(e)
        if dbConn:
            try:
                dbConn.rollback()
            except Exception as ee:
                myLogging.error(ee)
    dbCsor.close()
    dbConn.close()
def searchAndCrawlByName(comName, proxy=None):
    """Search qichacha for `comName` and insert every matching company uid.

    Returns 'ok' once results were processed, None when the search could
    not be fetched or yielded no result list.
    """
    if not comName:
        return None
    from urllib import quote  # local import: quote is not used at module scope
    comName = comName.encode('utf-8')
    # Bug fix: the search URL was left hard-coded to key=%E5%B0%8F%E7%B1%B3
    # ('小米', evidently a debugging leftover); build it from the requested
    # company name as the original commented-out line intended.
    baseUrl = 'http://www.qichacha.com/search?key=' + quote(comName)
    htmlContent = getQichachaHtml(baseUrl, noCookie=True)
    if not htmlContent:
        return None
    soup = getSoupByStrEncode(htmlContent)
    if not soup.select('ul.list-group a') or len(
            soup.select('ul.list-group a')) < 1:
        myLogging.debug(htmlContent)
        return None
    for uidTag in soup.select('ul.list-group a'):
        uid = uidTag['href'].replace('firm_', '')
        if uid == uidTag['href']:
            # href did not contain a firm_ uid at all
            myLogging.warning('not uid, skip %s', uidTag['href'])
            continue
        uid = uid.replace('.shtml', '').replace('/', '')
        prv = None
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        try:
            insertWithUid(conn, csor, prv, uid)
        except Exception as e:
            myLogging.error('insert with uid fail, uid: %s', uid)
    return 'ok'
def dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov):
    """Extract company uids from one listing-page soup and insert each.

    Tags without an href, already-crawled uids and per-uid failures are
    skipped without aborting the rest of the page.
    """
    uidList = pageSoup.select('.list-group-item')
    if len(uidList) < 1:
        myLogging.error('no com list, skip %s page: %s', prov, pageCount)
        return
    for uidTag in uidList:
        uid = None  # keep defined for the except logger below
        try:
            if not uidTag.has_attr('href'):
                myLogging.error('no com Tag, skip %s page: %s; tag: %s', prov,
                                pageCount, uidTag)
                # Bug fix: skip only this tag; the old code returned here
                # and abandoned the whole page (its own '# continue'
                # comment shows the intent).
                continue
            prv = None
            uid = uidTag['href'].replace('firm_', '').replace(
                '.shtml', '').replace('/', '')
            if '_' in uid:
                strs = uid.split('_')
                prv = strs[0]
                uid = strs[1]
            if uid in idBloom:
                myLogging.info('already crawled, skip uid: %s', uid)
                continue
            insertWithUid(conn, csor, prv, uid)
        except Exception as ee:
            myLogging.error('uid: %s error: %s', uid, ee)
def getExistsCapsRawUrlId(bookId):
    """Return all (id, rawUrl) rows for a book's chapters, or None.

    Returns None when the book has no chapters or the query fails.
    """
    conn, csor = getDushuConnCsor()
    try:
        # Parameterized query instead of %-interpolating bookId into SQL.
        csor.execute(
            'select id,rawUrl from cn_dushu_acticle where bookId = %s',
            (bookId, ))
        conn.commit()
        results = csor.fetchall()
        if not results or len(results) < 1:
            myLogging.warning('no caps,, bookId:' + str(bookId))
            return None
        return results
    except Exception as e:
        myLogging.error(e)
        return None
    finally:
        # Bug fix: the old code only reached close() on the exception path,
        # leaking the connection on every successful call.
        csor.close()
        conn.close()
def updateFromMysql():
    '''
    Refresh every shuqi book still marked as serializing, logging any
    per-book failure and moving on to the next one.
    '''
    for bookObj in getShuqiAllLianZaiBookObjs():
        try:
            updateByBookObj(bookObj)
        except Exception as e:
            myLogging.error('update book' + str(bookObj['id']) +
                            ' raise exception ')
            myLogging.error(traceback.format_exc())
def changeSouceIds():
    # For every mianfei book, search the upstream API by title+author and
    # re-map our stored source id when upstream re-assigned book ids.
    # A candidate is accepted when title+author match AND (when we have
    # chapters) the latest stored chapter title matches upstream's.
    bookObjs = getMianAllBookBaseObjs()
    for bookObj in bookObjs:
        try:
            foundNewId = False
            title = bookObj['title']
            author = bookObj['author']
            source = bookObj['source']
            bookId = bookObj['id']
            searchUrl = MianFeiTXTSearchBaseUrl + '?' + paramMap().mianfeiTXT()\
                .put('keyword', (title + author).encode('utf-8'))\
                .put('pageSize', '10').put('pageNum', '1').put('type', '1')\
                .mianfeiTXTSign() \
                .toUrl()
            # time.sleep(random.)
            r = requests.get(searchUrl)
            searchRes = json.loads(r.text)
            for resBook in searchRes['data']['books']:
                resTitle = resBook['name']
                if resTitle != title:
                    continue
                resAuthor = resBook['author']
                if resAuthor != author:
                    continue
                resId = resBook['id']
                if str(resId) == str(source):
                    # Search returned the id we already store.
                    myLogging.info(
                        'WTF: id no change?, bookId: %s, orgSoueceId: %s, newId: %s',
                        bookId, source, resId)
                latestChapObj = getLatestChapByBookId(bookId)
                if not latestChapObj:
                    # No chapters stored yet: accept on title+author alone.
                    myLogging.error(
                        'no chaps in db yet, bookId: %s, new mid: %s',
                        bookId, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
                cid = latestChapObj['idx']
                chapTitle = latestChapObj['title']
                capContentUrl = MianFeiTXTChapBaseUrl + '?' \
                    + paramMap().mianfeiTXT().mBookId(resId).mChapId(
                        cid).mianfeiTXTSign().toUrl()
                capContent = getContentWithUA(capContentUrl)
                if not capContent:
                    # one retry on an empty response
                    capContent = getContentWithUA(capContentUrl)
                # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
                capListJsonObj = json.loads(capContent, strict=False)
                if not (capListJsonObj['returnCode'] == '0000'):
                    # re-parse strictly, then verify code and message
                    capListJsonObj = json.loads(capContent)
                    if not (capListJsonObj['returnCode'] == '0000'
                            and capListJsonObj['returnMsg'] == u'成功'):
                        myLogging.error(
                            'get chap detail fail mid: %s, cid: %s', resId,
                            cid)
                        continue
                chapterName = capListJsonObj['data']['bookChapter'][
                    'chapterName']
                if chapterName == chapTitle:
                    # Same latest-chapter title: confident it is the same
                    # book under a new upstream id.
                    myLogging.info('bookId %s change source from %s to %s',
                                   bookId, source, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
            if not foundNewId:
                myLogging.error(
                    'bookId %s did not find new id !!!,title: %s, author: %s, org source: %s',
                    bookId, title, author, source)
        except Exception as e:
            myLogging.error(traceback.format_exc())
def handlChapsByBookObjZidBocId(bookObj, zid, chapListObj, allowUpdate=False):
    """Crawl every chapter of a zhuishushenqi chapter list into DB + OSS.

    Returns the highest chapter index successfully stored (0 when nothing
    was stored). With allowUpdate, chapters whose idx or title is already
    in the DB are skipped.
    """
    resInx = 0  # highest chapter index stored so far
    if not chapListObj:
        myLogging.error('zid %s get chaps list null', zid)
        return resInx
    if not chapListObj.has_key('chapters'):
        myLogging.error('zid %s chaps list no data', zid)
        return resInx
    capIdxs = set()
    capTitles = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # idxs already stored
        capTitles = getChapTitlesByBookId(bookObj['id'])  # titles already stored
    for idx in range(0, len(chapListObj['chapters'])):
        try:
            chapObj = chapListObj['chapters'][idx]
            if chapObj['title'] in capTitles:
                continue
            if idx in capIdxs:
                continue
            # Prefer the upstream chapter id; fall back to the link.
            chapObj['cid'] = chapObj['link']
            if chapObj.has_key('id'):
                chapObj['cid'] = chapObj['id']
            chapObj['idx'] = idx
            chapContentUrl = ZSSQCHAPCONTENTBASEURL + quote(chapObj['link'])
            chapContentText = getContentWithUA(chapContentUrl)
            if not chapContentText:
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get chapContent null ',
                    zid, bookObj['id'], chapObj['cid'])
                continue
            chapContentObj = json.loads(chapContentText)
            if not chapContentObj or not chapContentObj.has_key('chapter'):
                # Bug fix: placeholder was the malformed '%5'.
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get no chapter ', zid,
                    bookObj['id'], chapObj['cid'])
                continue
            if u'.' == chapContentObj['chapter']['title'] or len(
                    chapContentObj['chapter']['title']) < 2:
                # Junk upstream title: keep the chapter-list title instead.
                del chapContentObj['chapter']['title']
            chapObj.update(chapContentObj['chapter'])
            chapObj['content'] = chapObj['body']
            if chapObj.has_key('cpContent'):
                chapObj['content'] = chapObj['cpContent']
                del chapObj['cpContent']
            chapObj['content'] = textClean(chapObj['content'])
            if len(chapObj['content']) < MinChapContentLength:
                myLogging.error('zid %s cid %s content too small skip', zid,
                                chapObj['cid'])
                continue
            del chapObj['body']
            del chapObj['link']
            chapObj['rawUrl'] = chapContentUrl
            chapObj['size'] = len(chapObj['content'])
            chapObj['bookId'] = bookObj['id']
            chapObj['source'] = bookObj['source']
            chapObj['bookUUID'] = bookObj['digest']
            digest = getCapDigest(bookObj, chapObj, chapObj['cid'])
            chapObj['digest'] = digest
            capId = insertCapWithCapObj(chapObj)
            if not capId:
                continue
            uploadJson2Bucket(str(capId) + '.json', json.dumps(chapObj))
            resInx = max(resInx, idx)
        except Exception as e:
            # Bug fix: placeholder was a bare '%'.
            myLogging.error('zid: %s, dbid: %s, idx: %s, get exception ', zid,
                            bookObj['id'], idx)
            myLogging.error(traceback.format_exc())
    return resInx
def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx = 1):
    # Crawl chapters [0, count] of a mianfeiTXT book and return the index
    # reached. On an empty/failed chapter the function returns early so the
    # caller's retry-forward logic can resume from that index later.
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # idxs already in the DB
    # timing accumulators for the per-phase averages logged at the end
    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:
            if allowUpdate:
                if cid in capIdxs:
                    continue  # chapter already stored, skip
            befCrawl = time.time()
            succCapTimes = succCapTimes + 1
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(bookObj['source']).mChapId(
                cid).mianfeiTXTSign().toUrl()
            # NOTE(review): `ua` is not defined in this function; presumably
            # a module-level user-agent value — verify in the full file.
            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                # one retry on an empty response
                capContent = getContentWithUA(capContentUrl, ua)
            # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                # re-parse strictly, then verify code and message
                capListJsonObj = json.loads(capContent)
                if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                    resIdx = min(cid, resIdx)
                    myLogging.info('chap content null ,RETURN, capId:' +
                                   str(cid) + ' mid: ' + str(mid))
                    # Upstream api lags behind; bail out so the caller's
                    # retry logic can take over from here.
                    return resIdx
            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null ,RETURN, capId:' +
                                str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                # same early-exit rationale as above
                return resIdx
            if contentSoup.body['style']:
                del contentSoup.body['style']
            # strip the body wrapper and collapse duplicated line breaks
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '').replace(u'\n\n', u'\n').replace(
                u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # capObj['size'] = int(WordsCount)
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']
            digest = getCapDigest(bookObj, capObj, cid)
            capObj['digest'] = digest
            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)
            capId = insertCapWithCapObj(capObj)
            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)
            if not capId:
                continue
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl' + str(mid) + ' cap ' + str(cid) +
                            ' exception: ' + str(e))
            resIdx = min(cid, resIdx)
    if succCapTimes > 1:
        # succCapTimes started at 1; subtract it to get the real count
        succCapTimes = succCapTimes - 1
    myLogging.info(
        'crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) + \
        ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) + \
        ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx
#!/usr/bin/python # -*- coding: UTF-8 -*- ''' @author: zyq ''' import time import traceback from app.SearchHistoryCrawler import crawlByDailySearchHistory from app.shuqiUpdater import updateFromMysql from local.hotConfigHelper import getHotConfigDict from util.logHelper import myLogging # import logging # logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p',level=logging.INFO) if __name__ == '__main__': timeStart = int(time.time() * 1000) - 24 * 3600 * 1000 while 1: myLogging.info('begin searchHistoryCrawler') timeBeforeSearch = int(time.time() * 1000) try: crawlByDailySearchHistory(timeStart) except Exception as e: myLogging.error(traceback.format_exc()) timeStart = timeBeforeSearch sleepTime = getHotConfigDict()['searchHistoryCrawler']['updateSleep'] myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs') time.sleep(int(sleepTime))