def onlyInsertCap(queue): from DBUtils.PooledDB import PooledDB pool3 = PooledDB(creator=MySQLdb, mincached=1, maxcached=2, host=EADHOST, port=3306, user="******", passwd=EADPASSWD, db="dushu", use_unicode=True, charset='utf8') conn3 = pool3.connection() csor3 = conn3.cursor() # conn.set_character_set('utf8') csor3.execute('SET NAMES utf8') csor3.execute("SET CHARACTER SET utf8") csor3.execute("SET character_set_connection=utf8") while True: capObj = queue.get() # print i, capObj['source'] + capObj['idx'] try: capId = insertCapWithCapObj(capObj, conn3, csor3) if not capId: continue uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj)) global donedegest donedegest.add(capObj['digest']) except Exception as e: print 'insertCap error, ', e
def crawlCapsWithBookObj(allowUpdate, bookId, bookObj): ''' 根据book对象处理章节,新增或更新,会判断库中是否已有某章节 :param allowUpdate: 是否允许更新 :param bookId: shuqi的id :param bookObj: 库中的书信息 :return: ''' newChapNum = bookObj['chapterNum'] global donedegest capObjList = getCapObjsByBookObj(allowUpdate, bookId, bookObj) if not capObjList: print 'no capObjList, sid: ', bookId return newChapNum for capObj in capObjList: capId = insertCapWithCapObj2(capObj) donedegest.add(capObj['digest']) if not capId: newChapNum = min(newChapNum, capObj['idx'] + 1) continue uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj)) # existsCaps = getExistsCaps(bookObj['id']) return newChapNum
def handlChapByBookObjChapObj(allowUpdate, bookObj, chapObj):
    """Download one chapter, normalise ``chapObj`` in place and store it.

    The chapter JSON is fetched from ``chapObj['url']`` and merged into
    ``chapObj``, which is then given the fields the storage layer expects
    (title, rawUrl, idx, size, bookId, source, bookUUID, digest, cleaned
    content). The chapter is inserted via ``insertCapWithCapObj`` and, on
    success, uploaded as ``<id>.json``.

    :param allowUpdate: forwarded to ``insertCapWithCapObj``
    :param bookObj: book record; ``id``, ``source`` and ``digest`` are read
    :param chapObj: chapter dict; mutated in place
    :return: the chapter's integer ``idx`` on success, ``0`` when the
             download is empty, the content is missing or too short, or the
             insert returns no id
    """
    chapContentUrl = chapObj['url']
    chapContent = getContentWithUA(chapContentUrl)
    if not chapContent:
        # An empty response would otherwise blow up inside json.loads();
        # treat it like every other failure of this function and return 0.
        myLogging.error('zid %s get chapContent null, chapContentUrl %s', bookObj['id'], chapContentUrl)
        return 0

    chapContentObj = json.loads(chapContent)
    # .get() so that a payload without a 'content' key is skipped, not a KeyError.
    rawContent = chapContentObj.get('content') if chapContentObj else None
    if not rawContent or len(rawContent) < MinChapContentLength:
        myLogging.error('zid %s content too small skip, chapContentUrl %s', bookObj['id'], chapContentUrl)
        return 0

    chapObj.update(chapContentObj)
    chapObj['title'] = chapObj['name']
    chapObj['rawUrl'] = chapContentUrl
    chapObj['idx'] = int(chapObj['serialNumber'])
    del chapObj['serialNumber']
    # Size is measured on the content as downloaded, before textClean().
    chapObj['size'] = len(chapObj['content'])
    chapObj['bookId'] = bookObj['id']
    chapObj['source'] = bookObj['source']
    chapObj['bookUUID'] = bookObj['digest']
    chapObj['digest'] = getCapDigest(bookObj, chapObj, chapObj['bookChapterId'])
    chapObj['content'] = textClean(chapObj['content'])

    capId = insertCapWithCapObj(chapObj, allowUpdate=allowUpdate)
    if not capId:
        myLogging.error('no chapId cid %s', chapObj['bookChapterId'])
        return 0

    uploadJson2Bucket(str(chapObj['id']) + '.json', json.dumps(chapObj))
    return chapObj['idx']
def fixNewLineByBookObjs(quanBenObjs):
    """Re-clean the stored content of every chapter of the given books.

    For each book the chapter ids are looked up with ``getCapIdsByBookId``.
    Each chapter's JSON is downloaded from ``ossBaseUrl + <chapId>.json``.
    If the parsed object is empty or has no ``content`` key, the chapter is
    deleted via ``delCapById``. Otherwise its content is passed through
    ``textClean`` and the object is uploaded again under the same name.
    Failures on a single chapter are logged with a traceback and do not stop
    the run.

    :param quanBenObjs: iterable of book dicts; only ``id`` is read
    """
    from parse.contentHelper import textClean

    for book in quanBenObjs:
        for chapId in getCapIdsByBookId(book['id']):
            try:
                jsonName = str(chapId) + '.json'
                response = requests.get(ossBaseUrl + jsonName)
                stored = json.loads(response.text)
                if stored and stored.has_key('content'):
                    stored['content'] = textClean(stored['content'])
                    uploadJson2Bucket(jsonName, json.dumps(stored))
                    myLogging.info('succ cid %s', chapId)
                else:
                    delCapById(chapId)
                    myLogging.info('chap id %s, has no oss obj, delete', chapId)
            except Exception:
                myLogging.error('chap id %s, with exception: %s', chapId, traceback.format_exc())
def handleCapUpload(cap): cid = cap[0] capUrl = cap[2] bookId = cap[5] unclearContent = cap[4] capObj = dict() capObj['id'] = cap[0] capObj['title'] = cap[1] capObj['rawUrl'] = cap[2] capObj['source'] = cap[3] capObj['content'] = cap[4] capObj['bookId'] = cap[5] capObj['idx'] = cap[6] capObj['digest'] = cap[7] capObj['size'] = cap[8] capObj['bookUUID'] = cap[9] content = unclearContent if unclearContent and not (u' 言情小说_打造最新原创' in unclearContent or unclearContent == 'None'): uploadJson2Bucket(str(cid) + '.json', json.dumps(capObj)) else: try: if not capUrl or len(capUrl) < 1: print cid, 'no url, bookId : ', bookId else: if 'shuqireader' in capUrl: content = getContentByUrl(capUrl) # updateContentById(cid, content) else: content, host = getAndParse(capUrl) if not content: print cid, ' getAndparse content failed, bookId : ', bookId # continue # updateContentById(cid, content) # cap[4] = content capObj['content'] = content upload2Bucket(str(cid) + '.json', json.dumps(capObj)) except Exception as e: print 'cid ', cid, 'error: ', e except ValueError as er: print 'cid ', cid, 'error: ', er
def handlChapsByBookObjZidBocId(bookObj, zid, chapListObj, allowUpdate=False):
    """Download, store and upload every chapter listed in ``chapListObj``.

    Each entry of ``chapListObj['chapters']`` is fetched from
    ``ZSSQCHAPCONTENTBASEURL`` + its quoted ``link``, merged with the
    returned ``chapter`` object, cleaned, inserted via
    ``insertCapWithCapObj`` and uploaded as ``<capId>.json``. A failure on
    one chapter is logged and the loop continues with the next one.

    :param bookObj: book record; ``id``, ``source`` and ``digest`` are read
    :param zid: id used in log messages only
    :param chapListObj: dict expected to carry a ``chapters`` list
    :param allowUpdate: when true, chapters whose title or index is already
                        stored for this book are skipped
    :return: the highest list index that was inserted and uploaded, or ``0``
             if none was (or if the chapter list is missing)
    """
    resInx = 0  # highest index stored so far
    if not chapListObj:
        myLogging.error('zid %s get chaps list null', zid)
        return resInx
    if 'chapters' not in chapListObj:
        myLogging.error('zid %s chaps list no data', zid)
        return resInx

    capIdxs = set()
    capTitles = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in the database
        capTitles = getChapTitlesByBookId(bookObj['id'])  # chapter titles already in the database

    for idx, chapObj in enumerate(chapListObj['chapters']):
        try:
            if chapObj['title'] in capTitles:
                continue
            if idx in capIdxs:
                continue

            # Prefer an explicit id as the chapter id, fall back to the link.
            chapObj['cid'] = chapObj['link']
            if 'id' in chapObj:
                chapObj['cid'] = chapObj['id']
            chapObj['idx'] = idx

            chapContentUrl = ZSSQCHAPCONTENTBASEURL + quote(chapObj['link'])
            chapContentText = getContentWithUA(chapContentUrl)
            if not chapContentText:
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get chapContent null ', zid, bookObj['id'], chapObj['cid'])
                continue

            chapContentObj = json.loads(chapContentText)
            if not chapContentObj or 'chapter' not in chapContentObj:
                # Format string fixed: the original '%5,' is not a valid
                # conversion, so this message could never be rendered.
                myLogging.error(
                    'zid: %s, dbid: %s, chapId: %s, get no chapter ', zid, bookObj['id'], chapObj['cid'])
                continue

            chapterDetail = chapContentObj['chapter']
            if u'.' == chapterDetail['title'] or len(chapterDetail['title']) < 2:
                # Drop a placeholder title so update() below keeps the title
                # that came with the chapter list.
                del chapterDetail['title']
            chapObj.update(chapterDetail)

            chapObj['content'] = chapObj['body']
            if 'cpContent' in chapObj:
                chapObj['content'] = chapObj['cpContent']
                del chapObj['cpContent']
            chapObj['content'] = textClean(chapObj['content'])
            if len(chapObj['content']) < MinChapContentLength:
                myLogging.error('zid %s cid %s content too small skip', zid, chapObj['cid'])
                continue

            del chapObj['body']
            del chapObj['link']
            chapObj['rawUrl'] = chapContentUrl
            chapObj['size'] = len(chapObj['content'])
            chapObj['bookId'] = bookObj['id']
            chapObj['source'] = bookObj['source']
            chapObj['bookUUID'] = bookObj['digest']
            chapObj['digest'] = getCapDigest(bookObj, chapObj, chapObj['cid'])

            capId = insertCapWithCapObj(chapObj)
            if not capId:
                continue
            uploadJson2Bucket(str(capId) + '.json', json.dumps(chapObj))
            resInx = max(resInx, idx)
        except Exception:
            # Format string fixed: the original 'zid: %,' is not a valid
            # conversion, so the message was lost whenever this path ran.
            myLogging.error('zid: %s, dbid: %s, idx: %s, get exception ', zid, bookObj['id'], idx)
            myLogging.error(traceback.format_exc())
    return resInx
def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx=1):
    """Crawl chapter ids ``0..count`` of a book, store and upload each one.

    For every chapter id the content URL is built from
    ``MianFeiTXTChapBaseUrl`` and ``paramMap``, fetched (with one retry on an
    empty response), parsed, cleaned, inserted via ``insertCapWithCapObj``
    and uploaded as ``<id>.json``. The loop returns early as soon as the API
    reports a non-success result or empty chapter content. Average timings
    for crawl/parse, insert and upload are logged at the end.

    :param allowUpdate: when true, chapter ids already stored for this book
                        are skipped
    :param bookObj: book record; ``id``, ``source`` and ``digest`` are read
    :param count: last chapter id to try (inclusive)
    :param mid: id used in log messages only
    :param startCapIdx: initial value of the returned index
    :return: ``startCapIdx`` raised to the id of each chapter stored and
             lowered to the id of any chapter that failed or ended the crawl
    """
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in the database

    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:
            if allowUpdate and cid in capIdxs:
                continue  # chapter is already in the database, skip it

            befCrawl = time.time()
            succCapTimes = succCapTimes + 1
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + (
                paramMap().mianfeiTXT().mBookId(bookObj['source']).mChapId(cid).mianfeiTXTSign().toUrl())
            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                # One retry on an empty response.
                capContent = getContentWithUA(capContentUrl, ua)

            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                # NOTE(review): this re-parses the same text in strict mode;
                # kept as is because strict parsing may raise where the
                # lenient pass did not.
                capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                resIdx = min(cid, resIdx)
                myLogging.info('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                # The source API lags behind on updates; bail out here so the
                # later "keep retrying forward" approach can pick up from resIdx.
                return resIdx

            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                # Same early exit as above: leave the rest to the retry logic.
                return resIdx

            # .get() instead of indexing: assuming getSoupByStr returns a
            # BeautifulSoup object, body['style'] raises KeyError when the
            # attribute is absent, which made this guard fail the chapter.
            if contentSoup.body.get('style'):
                del contentSoup.body['style']

            # NOTE(review): the last replace targets a literal '<br\>'
            # (backslash); confirm whether '<br/>' was intended.
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '').replace(
                u'\n\n', u'\n').replace(u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # Size is measured on the content before textClean().
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']
            capObj['digest'] = getCapDigest(bookObj, capObj, cid)

            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)

            capId = insertCapWithCapObj(capObj)
            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)
            if not capId:
                continue

            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl' + str(mid) + ' cap ' + str(cid) + ' exception: ' + str(e))
            resIdx = min(cid, resIdx)

    # succCapTimes started at 1, so subtract that offset once anything was attempted.
    if succCapTimes > 1:
        succCapTimes = succCapTimes - 1
    myLogging.info(
        'crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) +
        ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) +
        ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx