def cleanSubtitle():
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    bookId = 2584584
    carry = 50000
    while bookId < 2590000:
        try:
            dictCsor.execute(
                'select id,subtitle from ' + db_dushu +
                " where id >= %s and id <= %s and subtitle REGEXP '[0-9]{5,20}'",
                (bookId, bookId + carry))
            conn.commit()
            books = dictCsor.fetchallDict()
            for book in books:
                newSubtitle = subTitleClean(book['subtitle'])
                if not newSubtitle == book['subtitle'].encode('utf-8'):
                    myLogging.info('bookId %s update from %s to %s', book['id'],
                                   book['subtitle'].encode('utf-8'), newSubtitle)
                    updateOneFieldByOneField('subtitle', newSubtitle, 'id', book['id'])
        except Exception as e:
            myLogging.warning(e)
        bookId += carry
    csor.close()
    conn.close()

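# Hedged sketch only: subTitleClean is defined elsewhere in this repo and is not shown here.
# Judging from the REGEXP '[0-9]{5,20}' filter used above, it is assumed to strip long digit
# runs (ad / tracking noise) from subtitles; the helper below is an illustrative guess, not
# the project's actual implementation.
import re

def subTitleCleanSketch(subtitle):
    # drop runs of 5-20 digits and collapse the leftover whitespace
    cleaned = re.sub(r'[0-9]{5,20}', '', subtitle)
    return re.sub(r'\s+', ' ', cleaned).strip()
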
def updateFromMysql(st=10000, end=7000000):
    '''
    Run forever: query the novels that are still being serialized from the database and update them.
    '''
    idx = st
    carry = 10000
    myLogging.info('start from %s to %s ', st, end)
    while idx < end:
        # seq = range(5000, 6000)
        seq = range(idx, idx + carry)
        random.shuffle(seq)
        for sqBid in seq:
            # print sqBid
            # if sqBid in nullIdSet:
            #     continue
            if not srcIdBloom.contains('shuqi' + str(sqBid)):
                try:
                    num = start(sqBid, allowUpdate=False)
                    if num and num > 0:
                        srcIdBloom.add('shuqi' + str(sqBid))
                    # start(17043)
                except Exception as e:
                    myLogging.error('shuqi sid: %s , has exception %s', str(sqBid), traceback.format_exc())
                except IOError as e2:
                    myLogging.error('shuqi sid: %s , has exception %s', str(sqBid), traceback.format_exc())
        idx = idx + carry

def updateByBookObj(bookObj):
    source = bookObj['source']
    [zid, zBocId] = source.split('/')
    currentChapsObj = getChapsByBocId(zBocId)
    if not currentChapsObj or not currentChapsObj.has_key('chapters') or len(currentChapsObj['chapters']) < 1:
        # delBookById(bookObj['id'])
        myLogging.error('zssq book may have been dropped, plz consider deleting id: '
                        + str(bookObj['id']) + ' sid: ' + str(source))
        return
    currentChapNum = len(currentChapsObj['chapters'])
    if currentChapNum > bookObj['chapterNum']:
        newIdx = handlChapsByBookObjZidBocId(bookObj, zid, currentChapsObj, allowUpdate=True)
        if newIdx >= bookObj['chapterNum']:  # the newIdx index starts from 1
            updateOneFieldByOneField('chapterNum', newIdx + 1, 'id', bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info('zid: %s, bookId: %s update %s chaps ', zid, bookObj['id'],
                           str(newIdx + 1 - bookObj['chapterNum']))
    else:
        myLogging.info('zid: %s, bookId: %s no update ()', zid, bookObj['id'])

def searchAndCrawl(searchInput, limit=5):
    searchResObj = search(searchInput)
    succcount = 0
    count = 0
    for bookObj in searchResObj['books']:
        count += 1
        if count > 5:  # only look at the first N search results, ignore the rest
            break
        digest = getBookDigest(bookObj)
        if bookDigestBloom.contains(digest):
            myLogging.info('has book %s, with same author %s, skip',
                           bookObj['title'].encode('utf-8'), bookObj['author'].encode('utf-8'))
            continue
        zid = bookObj['_id']
        try:
            startByZid(zid, allowUpdate=False)
        except Exception as e:
            myLogging.error('zid %s has exception: %s', zid, traceback.format_exc())
        succcount += 1
        if succcount > limit:  # maximum number of books to crawl
            break

def dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov):
    uidList = pageSoup.select('.list-group-item')
    if len(uidList) < 1:
        myLogging.error('no com list, skip %s page: %s', prov, pageCount)
        return
        # continue
    for uidTag in uidList:
        try:
            if not uidTag.has_attr('href'):
                myLogging.error('no com Tag, skip %s page: %s; tag: %s', prov, pageCount, uidTag)
                # continue
                return
            prv = None
            uid = uidTag['href'].replace('firm_', '').replace('.shtml', '').replace('/', '')
            if '_' in uid:
                strs = uid.split('_')
                prv = strs[0]
                uid = strs[1]
            if uid in idBloom:
                myLogging.info('already crawled, skip uid: %s', uid)
                continue
            insertWithUid(conn, csor, prv, uid)
        except Exception as ee:
            myLogging.error('uid: %s error: %s', uid, ee)

def indexBookSuggest(st=218289):
    myLogging.info('st: %s', st)
    conn2, csor2 = getDushuConnCsor()
    csor2.execute(
        "select id,title,author from cn_dushu_book where id >= %s and operateStatus = 0 ",
        (st, ))
    conn2.commit()
    results = csor2.fetchall()
    baseUrl = DUSHU_SUGGEST_URL
    for book in results:
        id = book[0]
        title = book[1]
        author = book[2]
        # tags = book[3]
        bookObj = dict()
        sinput = []
        sinput.append(title)
        sinput.append(author)
        # if tags:
        #     ts = json.loads(tags)
        #     for t in ts:
        #         sinput.append(t)
        inputBoj = dict()
        inputBoj['input'] = sinput
        inputBoj['output'] = title + "(" + author + ')'
        bookObj['testsuggest'] = inputBoj
        try:
            r = requests.put(baseUrl + str(id), data=json.dumps(bookObj))
            print r.text
        except Exception as e:
            print bookObj, e

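# Hedged note: the 'testsuggest' document written above has the shape of an old-style
# Elasticsearch completion-suggester field (an 'input' list plus an 'output' string). The
# index behind DUSHU_SUGGEST_URL is not part of this repo; if it follows that convention,
# its mapping would need a completion field roughly like this (illustrative only):
exampleSuggestMapping = {
    "properties": {
        "testsuggest": {
            "type": "completion"
        }
    }
}
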
def fixNewLineByBookObjs(quanBenObjs):
    from parse.contentHelper import textClean
    for quanBenObj in quanBenObjs:
        bookId = quanBenObj['id']
        chapIds = getCapIdsByBookId(bookId)
        for chapId in chapIds:
            try:
                url = ossBaseUrl + str(chapId) + '.json'
                r = requests.get(url)
                obj = json.loads(r.text)
                if not obj or not obj.has_key('content'):
                    delCapById(chapId)
                    myLogging.info('chap id %s, has no oss obj, delete', chapId)
                    continue
                content = textClean(obj['content'])
                obj['content'] = content
                uploadJson2Bucket(str(chapId) + '.json', json.dumps(obj))
                myLogging.info('succ cid %s', chapId)
            except Exception as e:
                myLogging.error('chap id %s, with exception: %s', chapId, traceback.format_exc())

def insertCapWithCapObj(capObj, conn2=None, csor2=None, allowUpdate=False):
    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()
    # sql = "insert ignore cn_dushu_acticle (title,rawUrl,source,content,bookId,idx,digest,size,bookUUID) values" \
    #       "('%s','%s','%s','%s',%d,%d,'%s', %d, '%s')" % (
    #     capObj['title'], capObj['rawUrl'], capObj['source'], capObj['content'],
    #     capObj['bookId'], capObj['idx'], capObj['digest'], capObj['size'], capObj['bookUUID'])
    try:
        csor2.execute("insert cn_dushu_acticle (bookId,idx,digest,bookUUID,title,size) values"
                      "(%s,%s,%s,%s,%s,%s)",
                      (capObj['bookId'], capObj['idx'], capObj['digest'],
                       capObj['bookUUID'], capObj['title'], capObj['size']))
        # csor2.execute("update cn_dushu_acticle set title = %s, size= %s where digest = %s",
        #               (capObj['title'], capObj['size'], capObj['digest']))
        conn2.commit()
        # remember the id of the freshly inserted row so callers can upload the chapter body
        capObj['id'] = csor2.lastrowid
        myLogging.info('scap, ' + ":" + str(capObj['idx']))
        # , ', content: ', capObj['content'][0:15]
    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error(ee)
        if not allowUpdate:
            return None
        try:
            csor2.execute("select id,bookId from cn_dushu_acticle where digest = %s;", (capObj['digest'],))
            conn2.commit()
            sqlObj = csor2.fetchone()
            capId = sqlObj[0]
            bookId = sqlObj[1]
            if bookId != capObj['bookId']:
                myLogging.info('update bookId' + str(capId))
                # the row already exists with the wrong bookId: fix it so a bad chapter does not hold the slot
                csor2.execute("update cn_dushu_acticle set bookId = %s where id = %s;",
                              (capObj['bookId'], capId))
                conn2.commit()
            capObj['id'] = capId
            return capId
        except Exception as e:
            # roll back on error
            myLogging.error(e)
            if conn2:
                try:
                    conn2.rollback()
                except Exception as ee:
                    myLogging.error(ee)
            return None
    csor2.close()
    conn2.close()
    return capObj.get('id')

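# Minimal usage sketch: the insert statement above only reads the keys listed below, so a
# caller-built capObj needs at least these fields. Values are placeholders, not real data.
exampleCapObj = {
    'bookId': 1,                      # id of the parent book row
    'idx': 0,                         # chapter index within the book
    'digest': 'example-chap-digest',  # unique digest used for dedup / allowUpdate lookups
    'bookUUID': 'example-book-digest',
    'title': u'Chapter 1',
    'size': 1024,
}
# insertCapWithCapObj(exampleCapObj, allowUpdate=True)  # needs a live dushu MySQL connection
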
def getBookObj(allowUpdate, mid):
    befBookObj = time.time()
    bookObj, count = crawlCurrentBookObj(mid)
    aftBookObj = time.time()
    bookObj = insertBookWithConn(bookObj, allowUpdate)
    # aftInsertBookObj = time.time()
    myLogging.info('crawl book spent ' + str(aftBookObj - befBookObj) +
                   ' secs; insert spent ' + str(time.time() - aftBookObj))
    return bookObj, count

def getQichachaInvestDigests():
    idbloom = getBloom()
    conn, csor = getComConnCsor()
    csor.execute('select uid from com_invest')
    ids = csor.fetchall()
    [idbloom.add(mid[0]) for mid in ids]
    # if ids[0][0] in idbloom:
    myLogging.info('load exists ids ok')
    return idbloom

def qichachaFromProvs(provs):
    myLogging.info('start: provs %s', str(provs))
    catBaseIrl = 'http://www.qichacha.com/gongsi_area_prov_'
    conn, csor = getComConnCsor()
    for prov in provs:
        pageBaseUrl = catBaseIrl + prov + '_p_'
        for pageCount in range(1, 501):
            pageUrl = pageBaseUrl + str(pageCount) + '.shtml'
            try:
                pageContent = getQichachaHtml(pageUrl)
                pageSoup = getSoupByStrEncode(pageContent, 'utf-8')
                dealUIDsBySoup(conn, csor, pageCount, pageSoup, prov)
            except Exception as ee:
                myLogging.error('page ' + str(pageCount) + ' error %s', ee)

def dailyLatestUpdate():
    nanShengCategorys = (9, 18, 41, 50, 58, 69, 74, 82, 90, 93, 97, 111)
    nvShengCategorys = (1, 18, 26, 33, 50, 64, 97, 111)
    nanBooks = getLatestUpdateBooks(nanShengCategorys, limit=50)
    updateIdsByBooks(nanBooks, 'girlbest')
    myLogging.info('update boy latest ids to %s', nanBooks)
    nvBooks = getLatestUpdateBooks(nvShengCategorys, limit=50)
    updateIdsByBooks(nvBooks, 'boylastest')
    myLogging.info('update girl latest ids to %s', nvBooks)

def insertWithUid(conn2, csor2, prv, uid):
    if uid in idBloom:
        print 'already crawled uid:', uid
        return
    # idBloom.add(uid)
    global conn, csor
    if not conn or (not csor):
        conn2, csor2 = getComConnCsor()
    com_base_info_str = getBaseInfoById(prv, uid)
    com_base_info_json = json.loads(com_base_info_str)
    if com_base_info_json['status'] != 1:
        print 'json not succ , uid: ', uid, ' content:', com_base_info_str
        return
    data = com_base_info_json['data']['Company']
    companyType = data['EconKind']
    # webName = data['webName']
    companyName = data['Name']
    liscense = data['No']
    if not liscense:
        liscense = data['OrgNo']
    examineDate = ''
    if data['CheckDate']:
        examineDate = data['CheckDate'].strip()
    # webSite = ','.join(data['webSite'])
    # sql = """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""" % (str(id), companyName, companyType, examineDate, liscense, "tianyacha", webSite, webName)
    global staticInsertTotolCount, staticInsertTotolTime, staticInsertCarry
    startTime = time.time()
    try:
        csor2.execute(
            """insert ignore into com_base_copy (id,companyName,companyType,examineDate,liscense,source,src_content) values (%s,%s,%s,%s,%s,%s,%s);""",
            (uid, companyName, companyType, examineDate, liscense, "qichacha", com_base_info_str))
        conn2.commit()
        myLogging.info('comOk, uid: %s, comName: %s', uid, unicode(companyName).encode('utf-8'))
        endTime = time.time()
        thisSpentTime = endTime - startTime
        statisMysqlInsert(staticInsertCarry, thisSpentTime)
    except Exception as e:
        myLogging.error('insert error, uid: %s, error:%s', uid, e)

def insertBookWithConn(bookObj, allowUpdate=True, conn2=None, csor2=None):
    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()
    userId = random.randint(1, 50)
    updateTime = int(time.time())
    digest = getBookDigest(bookObj)
    bookObj['digest'] = digest
    # unified cleanup
    bookObj['subtitle'] = subTitleClean(bookObj['subtitle'])
    if not bookObj.has_key('source'):
        bookObj['source'] = ''
    try:
        csor2.execute('insert ' + db_dushu + '(categoryCode,typeCode,category,type,userId,title,subtitle,imgUrl,author,updateTime'
                      ',rawUrl,source,digest,status,viewNum, chapterNum, bookType, size) values'
                      '(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s)',
                      (bookObj['categoryCode'], bookObj['typeCode'], bookObj['category'], bookObj['type'],
                       userId, bookObj['title'], bookObj['subtitle'], bookObj['imgUrl'], bookObj['author'],
                       updateTime, bookObj['rawUrl'], bookObj['source'], digest, 11, bookObj['viewNum'],
                       bookObj['chapterNum'], bookObj['bookType'], bookObj['size']))
        # csorDoc.execute('update cn_dushu_book set subtitle = %s where digest = %s',
        #                 (bookObj['subtitle'], digest))
        conn2.commit()
        myLogging.info('succ book, ' + unicode(bookObj['title']).encode('utf-8'))
    except Exception as e:
        # roll back on error; the book most likely already exists
        myLogging.warning('update rollback; maybe exists, err: %s', traceback.format_exc())
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error('rollback error : ' + bookObj['rawUrl'])
        if u'完结' == bookObj['bookType']:
            updateBookTypeByRawUrl(bookObj['bookType'], bookObj['rawUrl'])
        # return None  # has a bug
        if not allowUpdate:
            return None

def crawlByDailySearchHistory(timeStart=None):
    myLogging.info('timeStart: %s', timeStart)
    baseUrl = 'http://%s/log/_search' % SEARCHHOST
    if not timeStart:
        timeStart = int(time.time() * 1000) - 24 * 3600 * 1000
    searchInput = '''
    {
      "size": 0,
      "query": {
        "bool": {
          "must": [
            { "range": { "page": { "gte": 1 } } },
            { "range": { "timestamp": { "gte": %s } } }
          ]
        }
      },
      "aggs": {
        "hist": {
          "terms": {
            "field": "word.raw",
            "size": 1000,
            "order": { "_count": "desc" }
          }
        }
      }
    }
    ''' % (str(timeStart))
    r = requests.post(baseUrl, data=searchInput)
    resObj = json.loads(r.text)
    for wordObj in resObj['aggregations']['hist']['buckets']:
        word = wordObj['key']
        searchAndCrawl(word)

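# Illustrative shape (not real data) of the response the loop above consumes: the 'hist'
# terms aggregation returns one bucket per search word, most frequent first, and only the
# bucket 'key' is passed on to searchAndCrawl.
exampleResObj = {
    'aggregations': {
        'hist': {
            'buckets': [
                {'key': u'example search word', 'doc_count': 42},
            ]
        }
    }
}
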
def updateContentById(id, content):
    conn, csor = getDushuConnCsor()
    # sql = "update cn_dushu_acticle set content = %s where id = %s " % (content, str(id))
    try:
        csor.execute("update cn_dushu_acticle set content = %s where id = %s ", (content, id))
        conn.commit()
        myLogging.info(str(id) + ' succ cap, ' + content[0:15])
    except Exception as e:
        # roll back on error
        myLogging.error(e)
        if conn:
            try:
                conn.rollback()
            except Exception as ee:
                myLogging.error(ee)
    csor.close()
    conn.close()

def mianfeiUpdateByBookObj(bookObj, maxChapNum=0):
    mid = bookObj['source']
    newBookObj, newChapNum = crawlCurrentBookObj(mid)
    if not newBookObj:
        myLogging.error('mid %s with dbId %s get None currentBookObj, plz check', mid, bookObj['id'])
        return
    latestCapIndex = newBookObj['latestCapIndex']
    newChapNum = max(newChapNum, latestCapIndex, maxChapNum)
    if newChapNum >= bookObj['chapterNum']:
        resIdx = handleCapsByBookObj(allowUpdate=True, bookObj=bookObj, count=newChapNum,
                                     mid=mid, startCapIdx=bookObj['chapterNum'])
        if resIdx > bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', resIdx, 'id', bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info(newBookObj['title'].encode('utf-8') + ' update '
                           + str(resIdx - bookObj['chapterNum']) + ' chaps (mianTxt) ')
            if u'连载' != newBookObj['bookType']:
                updateOneFieldByOneField('bookType', newBookObj['bookType'], 'id', bookObj['id'])
                myLogging.info(newBookObj['title'].encode('utf-8') + newBookObj['bookType'])
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') + ' no update (mianTxt)')

def updateByBookObj(bookObj):
    source = int(bookObj['source'].replace('shuqi', ''))
    newBookObj, digest = getBookObjFromSQid(source)
    if not newBookObj:
        # delBookById(bookObj['id'])
        myLogging.error('shuqi book has been dropped, plz consider deleting id: '
                        + str(bookObj['id']) + ' sid: ' + str(source))
        return
    if newBookObj['chapterNum'] > bookObj['chapterNum']:
        newBookObj['id'] = bookObj['id']
        newChapNum = crawlCapsWithBookObj(bookObj=newBookObj, bookId=source, allowUpdate=True)
        if newChapNum >= bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', newChapNum, 'id', bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info(newBookObj['title'].encode('utf-8') + ' update '
                           + str(newChapNum - bookObj['chapterNum']) + ' chaps ')
            if u'连载' != newBookObj['bookType']:
                updateOneFieldByOneField('bookType', newBookObj['bookType'], 'id', bookObj['id'])
                myLogging.warning(newBookObj['title'].encode('utf-8') + newBookObj['bookType'].encode('utf-8'))
        else:
            myLogging.info(newBookObj['title'].encode('utf-8')
                           + ' has unexpected state, please check; did not update ')
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') + ' no update ()')

def handleChapsByBookObj(bookObj, zid, allowUpdate=False):
    # zid = bookObj['source']
    bocObjs = getBocObjsByZid(zid)
    sourceCount = 0
    for bocIdx in range(0, len(bocObjs)):
        bocObj = bocObjs[bocIdx]
        bocId = bocObj['_id']
        try:
            bocSource = bocObj['source']
            if 'zhuishuvip' == bocSource:
                continue
            bookObj['source'] = zid + '/' + bocId
            bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(zid) + "?source=" + str(bocId)
            chapListObj = getChapsByBocId(bocId)
            bookObj['chapterNum'] = min(bookObj['chapterNum'], len(chapListObj['chapters']))
            if bookObj['chapterNum'] <= MINCHAPNUM:
                continue
            bookObj = insertBookWithConn(bookObj, allowUpdate)
            resInx = handlChapsByBookObjZidBocId(bookObj, zid, chapListObj, allowUpdate)
            if resInx <= MINCHAPNUM:
                myLogging.info('zid %s dbid %s crawl too small chapNum, delete ', zid, bookObj['id'])
                delBookById(bookObj['id'])
            sourceCount += 1
            if sourceCount >= sourceLimit:
                myLogging.info('zid: %s crawl source to sourceLimit', zid)
                break
            else:
                # bookObj['rawUrl'] = ZSSQBOOKINFOBASEURL + str(zid) + "?source=" + str(bocId)
                # bookObj = parseInsertBook(allowUpdate, bookObj, zid)  # re-insert the book from another source
                myLogging.info('zid: %s crawl another source %s', zid, bocId)
        except Exception as e:
            myLogging.error('zid: %s ,bocId %s get exception ', zid, bocId)
            myLogging.error(traceback.format_exc())

def getQichachaDigests():
    idbloom = loadBloomFromFile('local/qichachaUIDs')
    if idbloom:
        myLogging.info('load bloom from file succ, no need load from db')
        # return idbloom
    else:
        myLogging.info('no dump bloom file, load from db')
        idbloom = getBloom(2000 * 10000)
        # idbloom = getBloom()
    conn, csor = getComConnCsor()
    csor.execute('select id from com_base_copy')
    # csor.execute('select id from com_base_copy limit 10')
    ids = csor.fetchall()
    [idbloom.add(mid[0]) for mid in ids]
    # if ids[0][0] in idbloom:
    myLogging.info('load exists ids ok, generate dump bloom file')
    dumpBloomToFile(idbloom, fileName='local/qichachaUIDs')
    return idbloom

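# Hedged sketch of the bloom helpers used above (getBloom, dumpBloomToFile, loadBloomFromFile
# live elsewhere in this repo and may differ, e.g. srcIdBloom also exposes a contains()
# method). This illustrative version assumes a pybloom-live BloomFilter persisted to disk.
import os
from pybloom_live import BloomFilter

def getBloomSketch(capacity=2000 * 10000):
    return BloomFilter(capacity=capacity, error_rate=0.001)

def dumpBloomToFileSketch(bloom, fileName):
    with open(fileName, 'wb') as f:
        bloom.tofile(f)

def loadBloomFromFileSketch(fileName):
    # return None when no dump exists, mirroring the "load from db" fallback above
    if not os.path.exists(fileName):
        return None
    with open(fileName, 'rb') as f:
        return BloomFilter.fromfile(f)
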
def output(self):
    myLogging.info('mianfeiTXT output')

#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
@author: zyq
'''
import sys
# from app.mianfeiTXTNewFilder import findByIdRange
import time

from app.shuqi import start
from app.shuqiNewFilder import updateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging

if __name__ == '__main__':
    # start('10650', allowUpdate=False)
    st = 50000
    end = 500000
    if len(sys.argv) > 1:
        st = int(sys.argv[1])
        end = int(sys.argv[2])
    updateFromMysql(st, end)
    sleepTime = getHotConfigDict()['shuqiNewFinder']['updateSleep']
    myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
    time.sleep(int(sleepTime))

#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
@author: zyq
'''
import time

from app.mianfeiTXTUpdater import mianfeiTxtUpdateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging

if __name__ == '__main__':
    while 1:
        myLogging.info('begin mianfeiTXT updater')
        mianfeiTxtUpdateFromMysql()
        sleepTime = getHotConfigDict()['mianFeiTXTUpdater']['updateSleep']
        myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
        time.sleep(int(sleepTime))

#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
@author: zyq
'''
import time

from app.zssqUpdater import updateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging

# import logging
# logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

if __name__ == '__main__':
    while 1:
        myLogging.info('begin zssq updater')
        updateFromMysql()
        sleepTime = getHotConfigDict()['zssqUpdater']['updateSleep']
        myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
        time.sleep(int(sleepTime))

def output(self):
    myLogging.info('shuqi output')

def crawl(self):
    myLogging.info('mianfeiTXT crawl')
    handleByMTID(self.mid)

def crawl(self):
    myLogging.info('shuqi crawl')
    start(self.sid)

def handleCapsByBookObj(allowUpdate, bookObj, count, mid, startCapIdx=1):
    capIdxs = set()
    if allowUpdate:
        capIdxs = getCapIdxsByBookId(bookObj['id'])  # chapter indexes already in the db
    # myBookId = bookObj['id']
    #
    # startCap = time.time()
    crawlParseSpent = 0
    insertCap = 0
    uploadCap = 0
    succCapTimes = 1
    resIdx = startCapIdx
    for cid in range(0, count + 1):
        try:
            if allowUpdate:
                if cid in capIdxs:
                    continue  # this chapter is already in the db, skip it
                # else:
                #     startCap = time.time()
            befCrawl = time.time()
            succCapTimes = succCapTimes + 1
            # capContentUrl = MianFeiContentBaseUrl + str(cid) + '&contentid=' + str(mid)
            capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(
                bookObj['source']).mChapId(cid).mianfeiTXTSign().toUrl()
            capContent = getContentWithUA(capContentUrl, ua)
            if not capContent:
                capContent = getContentWithUA(capContentUrl, ua)
            # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
            capListJsonObj = json.loads(capContent, strict=False)
            if not (capListJsonObj['returnCode'] == '0000'):
                capListJsonObj = json.loads(capContent)
            if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                resIdx = min(cid, resIdx)
                myLogging.info('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                # the original api lags behind; return here so the later retry-forward logic can take over
                return resIdx
            capObj = dict()
            orgContent = capListJsonObj['data']['bookChapter']['content']
            contentSoup = getSoupByStr(orgContent)
            if not contentSoup or '' == orgContent or len(orgContent) < 1:
                myLogging.error('chap content null ,RETURN, capId:' + str(cid) + ' mid: ' + str(mid))
                resIdx = min(cid, resIdx)
                # the original api lags behind; return here so the later retry-forward logic can take over
                return resIdx
            if contentSoup.body.has_attr('style'):
                del contentSoup.body['style']
            content = unicode(contentSoup.body).replace(u'<body>', '').replace(u'</body>', '').replace(
                u'\n\n', u'\n').replace(u'<br><br>', u'<br>').replace(u'<br\><br\>', u'<br\>')
            capObj['content'] = textClean(content)
            capObj['title'] = unicode(contentSoup.title.get_text())
            capObj['rawUrl'] = capContentUrl
            # capObj['size'] = int(WordsCount)
            capObj['size'] = len(content)
            capObj['bookId'] = bookObj['id']
            capObj['source'] = bookObj['source']
            capObj['idx'] = cid
            capObj['bookUUID'] = bookObj['digest']
            digest = getCapDigest(bookObj, capObj, cid)
            capObj['digest'] = digest
            befInsertCap = time.time()
            crawlParseSpent = crawlParseSpent + (befInsertCap - befCrawl)
            capId = insertCapWithCapObj(capObj)
            aftInsertCap = time.time()
            insertCap = insertCap + (aftInsertCap - befInsertCap)
            if not capId:
                continue
            uploadJson2Bucket(str(capObj['id']) + '.json', json.dumps(capObj))
            aftUploadCap = time.time()
            uploadCap = uploadCap + (aftUploadCap - aftInsertCap)
            resIdx = max(cid, resIdx)
        except Exception as e:
            myLogging.error('crawl' + str(mid) + ' cap ' + str(cid) + ' exception: ' + str(e))
            resIdx = min(cid, resIdx)
    if succCapTimes > 1:
        succCapTimes = succCapTimes - 1
    myLogging.info('crawlParse avg: ' + str(float(crawlParseSpent) / float(succCapTimes)) +
                   ' insert avg: ' + str(float(insertCap) / float(succCapTimes)) +
                   ' upload avg: ' + str(float(uploadCap) / float(succCapTimes)))
    return resIdx

def changeSouceIds():
    bookObjs = getMianAllBookBaseObjs()
    for bookObj in bookObjs:
        try:
            foundNewId = False
            title = bookObj['title']
            author = bookObj['author']
            source = bookObj['source']
            bookId = bookObj['id']
            searchUrl = MianFeiTXTSearchBaseUrl + '?' + paramMap().mianfeiTXT() \
                .put('keyword', (title + author).encode('utf-8')) \
                .put('pageSize', '10').put('pageNum', '1').put('type', '1') \
                .mianfeiTXTSign() \
                .toUrl()
            # time.sleep(random.)
            r = requests.get(searchUrl)
            searchRes = json.loads(r.text)
            for resBook in searchRes['data']['books']:
                resTitle = resBook['name']
                if resTitle != title:
                    continue
                resAuthor = resBook['author']
                if resAuthor != author:
                    continue
                resId = resBook['id']
                if str(resId) == str(source):
                    myLogging.info('WTF: id no change?, bookId: %s, orgSoueceId: %s, newId: %s',
                                   bookId, source, resId)
                latestChapObj = getLatestChapByBookId(bookId)
                if not latestChapObj:
                    myLogging.error('no chaps in db yet, bookId: %s, new mid: %s', bookId, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
                cid = latestChapObj['idx']
                chapTitle = latestChapObj['title']
                capContentUrl = MianFeiTXTChapBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(
                    resId).mChapId(cid).mianfeiTXTSign().toUrl()
                capContent = getContentWithUA(capContentUrl)
                if not capContent:
                    capContent = getContentWithUA(capContentUrl)
                # capContent = capContent.replace(r'\r', '').replace(r'\n', '')
                capListJsonObj = json.loads(capContent, strict=False)
                if not (capListJsonObj['returnCode'] == '0000'):
                    capListJsonObj = json.loads(capContent)
                if not (capListJsonObj['returnCode'] == '0000' and capListJsonObj['returnMsg'] == u'成功'):
                    myLogging.error('get chap detail fail mid: %s, cid: %s', resId, cid)
                    continue
                chapterName = capListJsonObj['data']['bookChapter']['chapterName']
                if chapterName == chapTitle:
                    myLogging.info('bookId %s change source from %s to %s', bookId, source, resId)
                    updateOneFieldByOneField('source', resId, 'id', bookId)
                    foundNewId = True
                    break
            if not foundNewId:
                myLogging.error('bookId %s did not find new id !!!,title: %s, author: %s, org source: %s',
                                bookId, title, author, source)
        except Exception as e:
            myLogging.error(traceback.format_exc())

#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
@author: zyq
'''
import time
import traceback

from app.SearchHistoryCrawler import crawlByDailySearchHistory
from app.shuqiUpdater import updateFromMysql
from local.hotConfigHelper import getHotConfigDict
from util.logHelper import myLogging

# import logging
# logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

if __name__ == '__main__':
    timeStart = int(time.time() * 1000) - 24 * 3600 * 1000
    while 1:
        myLogging.info('begin searchHistoryCrawler')
        timeBeforeSearch = int(time.time() * 1000)
        try:
            crawlByDailySearchHistory(timeStart)
        except Exception as e:
            myLogging.error(traceback.format_exc())
        timeStart = timeBeforeSearch
        sleepTime = getHotConfigDict()['searchHistoryCrawler']['updateSleep']
        myLogging.info(' done one loop, now sleep ' + str(sleepTime) + ' secs')
        time.sleep(int(sleepTime))

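# Hedged example of the hot-config dict the runner scripts read via getHotConfigDict();
# only the keys actually referenced in this repo are shown, the sleep values are placeholders,
# and the backing store (file, redis, ...) is not assumed here.
exampleHotConfig = {
    'shuqiNewFinder': {'updateSleep': 3600},
    'mianFeiTXTUpdater': {'updateSleep': 3600},
    'zssqUpdater': {'updateSleep': 3600},
    'searchHistoryCrawler': {'updateSleep': 3600},
}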