def updateByBookObj(bookObj):
    """Re-crawl a shuqi-sourced book and persist new chapters if any.

    :param bookObj: dict-like DB row; must carry 'source' (e.g. 'shuqi123'),
                    'id', 'chapterNum'.
    :return: None (all effects are DB updates and log lines).

    NOTE(review): a second, different `updateByBookObj` is defined later in
    this file; the later definition shadows this one at import time — confirm
    which is intended to be live.
    """
    # 'shuqi' prefix stripped to recover the numeric upstream id.
    source = int(bookObj['source'].replace('shuqi', ''))
    newBookObj, digest = getBookObjFromSQid(source)
    if not newBookObj:
        # Upstream no longer serves the book; deletion is left to a human.
        # delBookById(bookObj['id'])
        myLogging.error(
            'shuqi book has been droped, plz consider to delete id: ' +
            str(bookObj['id']) + ' sid: ' + str(source))
        return
    if newBookObj['chapterNum'] > bookObj['chapterNum']:
        # Carry our DB primary key onto the freshly-crawled object so the
        # chapter crawler writes into the existing book.
        newBookObj['id'] = bookObj['id']
        newChapNum = crawlCapsWithBookObj(bookObj=newBookObj, bookId=source,
                                          allowUpdate=True)
        if newChapNum >= bookObj['chapterNum']:
            updateOneFieldByOneField('chapterNum', newChapNum, 'id',
                                     bookObj['id'])
            updateBoostWithUpdateTime(bookObj['id'])
            myLogging.info(
                newBookObj['title'].encode('utf-8') + ' update ' +
                str(newChapNum - bookObj['chapterNum']) + ' chaps ')
            # bookType changed away from "serializing" — persist and warn.
            if u'连载' != newBookObj['bookType']:
                updateOneFieldByOneField('bookType', newBookObj['bookType'],
                                         'id', bookObj['id'])
                myLogging.warning(newBookObj['title'].encode('utf-8') +
                                  newBookObj['bookType'].encode('utf-8'))
        else:
            # Crawl returned fewer chapters than we already had — suspicious.
            myLogging.info(newBookObj['title'].encode('utf-8') +
                           ' has unexcepted, please check. didnot update ')
    else:
        myLogging.info(newBookObj['title'].encode('utf-8') + ' no update ()')
def cleanSubtitle():
    """One-off maintenance pass: re-clean subtitles that contain long digit
    runs, over a fixed id window, in batches of `carry` rows.

    Fixes vs. original: removed a leftover `dictCsor.fetchoneDict()` after the
    loop (the cursor is exhausted at that point) and closed the dict cursor.
    """
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    bookId = 2584584          # start of the id window to sweep
    carry = 50000             # batch width per query
    while bookId < 2590000:
        try:
            dictCsor.execute(
                'select id,subtitle from ' + db_dushu +
                " where id >= %s and id <= %s and subtitle REGEXP '[0-9]{5,20}'",
                (bookId, bookId + carry))
            conn.commit()
            books = dictCsor.fetchallDict()
            for book in books:
                newSubtitle = subTitleClean(book['subtitle'])
                # subTitleClean returns utf-8 bytes; compare like with like.
                if newSubtitle != book['subtitle'].encode('utf-8'):
                    myLogging.info('bookId %s update from %s to %s',
                                   book['id'],
                                   book['subtitle'].encode('utf-8'),
                                   newSubtitle)
                    updateOneFieldByOneField('subtitle', newSubtitle, 'id',
                                             book['id'])
        except Exception as e:
            # Best-effort sweep: log and continue with the next batch.
            myLogging.warning(e)
        bookId += carry
    dictCsor.close()
    csor.close()
    conn.close()
def getLatestUpdateBooks(categorys, limit=30):
    """Return ids of the most recently updated books.

    :param categorys: iterable of categoryCode values for the SQL IN clause
    :param limit: maximum number of rows to return (default 30)
    :return: list of row dicts, each with an 'id' key
    """
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    # Books still carrying the default cover image are excluded.
    query = ('select id from ' + db_dushu +
             " where categoryCode in %s "
             "and imgUrl != 'http://tata-img.oss-cn-shanghai.aliyuncs.com/book-default.jpg' "
             " order by updateTime desc limit %s")
    try:
        dictCsor.execute(query, (categorys, limit))
        conn.commit()
    except Exception as e:
        myLogging.warning(e)
    rows = dictCsor.fetchallDict()
    csor.close()
    conn.close()
    return rows
def updateOneFieldByOneField(upFieldName, upFieldValue, byFieldName,
                             byFieldValue):
    """Set column `upFieldName` = `upFieldValue` (and refresh updateTime to
    now) on rows where `byFieldName` = `byFieldValue`.

    Field *names* are interpolated into the SQL (they come from call sites,
    not user input); all *values* are bound as parameters.

    Fixes vs. original: updateTime was string-concatenated into the SQL —
    now bound as a parameter; log message no longer claims 'bookType'.
    """
    conn, csor = getDushuConnCsor()
    try:
        csor.execute(
            "update " + db_dushu + " set " + upFieldName +
            " = %s, updateTime = %s where " + byFieldName + " = %s",
            (upFieldValue, int(time.time()), byFieldValue))
        conn.commit()
    except Exception as e:
        myLogging.warning('update ' + upFieldName + ' exception: ' + str(e))
    csor.close()
    conn.close()
def updateBookTypeByRawUrl(type, rawUrl):
    """Persist a new bookType for the book identified by its rawUrl."""
    conn, csor = getDushuConnCsor()
    sql = "update " + db_dushu + " set bookType = %s where rawUrl = %s"
    try:
        csor.execute(sql, (type, rawUrl))
        conn.commit()
    except Exception as e:
        myLogging.warning('update bookType exception: ' + str(e))
    csor.close()
    conn.close()
def getIdsByType(confType):
    """Return the `ids` blob stored for the given config type, or None.

    Fixes vs. original: `csor.fetchone()[0]` ran outside the try block and
    raised TypeError whenever the query failed or matched no row — the fetch
    is now guarded and a missing row yields None.
    """
    conn, csor = getDushuConnCsor()
    ids = None
    try:
        csor.execute("select ids from " + db_typeBook + " where type = %s",
                     (confType, ))
        conn.commit()
        row = csor.fetchone()
        if row:
            ids = row[0]
    except Exception as e:
        myLogging.warning('get bookType exception: ' + str(e))
    csor.close()
    conn.close()
    return ids
def getLatestChapByBookId(bookId):
    """Fetch the newest chapter row (highest id) for a book as a dict.

    :param bookId: primary key of the book
    :return: dict row, or None when the book has no chapters
    """
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    sql = ("select * from " + db_acticle +
           " where bookId = %s order by id desc limit 1;")
    try:
        dictCsor.execute(sql, (bookId, ))
        conn.commit()
    except Exception as e:
        myLogging.warning('getLatestChapByBookId exception: ' + str(e))
    latest = dictCsor.fetchoneDict()
    csor.close()
    conn.close()
    return latest
def fromInvestInt():
    """Seed the investment crawl from one hard-coded company row.

    Uses (and lazily initialises) the module-level DB connection pair.
    """
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()
    csor.execute(
        "select id,companyName from com_base_copy where id = '6bc7e7ccdb755391651316a0227c059b' and companyName is not Null limit 10;"
    )
    for uid, cName in csor.fetchall():
        if not cName:
            myLogging.warning('no comName skip, uid: %s', uid)
            continue
        getInvestListByNameId(uid, cName)
def getBookCount():
    """Return the total number of rows in the book table."""
    conn, csor = getDushuConnCsor()
    try:
        csor.execute("select count(*) from " + db_dushu)
        conn.commit()
    except Exception as e:
        myLogging.warning('update bookType exception: ' + str(e))
    total = csor.fetchone()[0]
    csor.close()
    conn.close()
    return total
def insertBookWithConn(bookObj, allowUpdate=True, conn2=None, csor2=None):
    """Insert a book row, creating a connection when none is supplied.

    On failure (typically a duplicate digest) the transaction is rolled back;
    a finished book additionally gets its bookType refreshed by rawUrl. When
    `allowUpdate` is False the function stops there and returns None;
    otherwise it falls through (caller-driven update path).

    Fixes vs. original: Py2-only `except Exception, e` replaced with the
    `as e` form used everywhere else in this file; `has_key` replaced with
    the `in` operator. Behavior is unchanged.
    """
    if not conn2 or not csor2:
        conn2, csor2 = getDushuConnCsor()
    userId = random.randint(1, 50)  # attribute the book to a random user
    updateTime = int(time.time())
    digest = getBookDigest(bookObj)
    bookObj['digest'] = digest
    # Unified cleanup before persisting.
    bookObj['subtitle'] = subTitleClean(bookObj['subtitle'])
    if 'source' not in bookObj:
        bookObj['source'] = ''
    try:
        csor2.execute(
            'insert ' + db_dushu +
            '(categoryCode,typeCode,category,type,userId,title,subtitle,imgUrl,author,updateTime'
            ",rawUrl,source,digest,status,viewNum, chapterNum, bookType, size) values"
            "(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s)",
            (bookObj['categoryCode'], bookObj['typeCode'],
             bookObj['category'], bookObj['type'], userId, bookObj['title'],
             bookObj['subtitle'], bookObj['imgUrl'], bookObj['author'],
             updateTime, bookObj['rawUrl'], bookObj['source'], digest, 11,
             bookObj['viewNum'], bookObj['chapterNum'], bookObj['bookType'],
             bookObj['size']))
        conn2.commit()
        myLogging.info('succ book, ' + unicode(bookObj['title']).encode('utf-8'))
    except Exception as e:
        # Roll back on any error — most commonly the book already exists.
        myLogging.warning('update rollback; maybe exists, err: %s',
                          traceback.format_exc())
        if conn2:
            try:
                conn2.rollback()
            except Exception as ee:
                myLogging.error('rollback error : ' + bookObj['rawUrl'])
        # A finished book that already exists still gets its status synced.
        if u'完结' == bookObj['bookType']:
            updateBookTypeByRawUrl(bookObj['bookType'], bookObj['rawUrl'])
        if not allowUpdate:
            return None
def updateIdsByType(confType, ids):
    """Store the `ids` value for the given config type row."""
    conn, csor = getDushuConnCsor()
    sql = "update " + db_typeBook + ' set ids = %s where type = %s'
    try:
        csor.execute(sql, (ids, confType))
        conn.commit()
    except Exception as e:
        myLogging.warning('update bookType exception: ' + str(e))
    csor.close()
    conn.close()
def deleteChapsLargerThanIdx(bookId, idx):
    """Delete every chapter of `bookId` whose idx is strictly greater
    than `idx`.

    :param bookId: book primary key
    :param idx: keep chapters up to and including this index
    """
    conn, csor = getDushuConnCsor()
    sql = 'delete from ' + db_acticle + " where bookId = %s and idx > %s"
    try:
        csor.execute(sql, (bookId, idx))
        conn.commit()
    except Exception as e:
        myLogging.warning(e)
    csor.close()
    conn.close()
def getBookObjById(dbid):
    """Load one book row by primary key.

    :param dbid: book primary key
    :return: dict row, or None when no row matches / the query failed
    """
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    try:
        dictCsor.execute("select * from " + db_dushu + " where id = %s",
                         (dbid, ))
        conn.commit()
    except Exception as e:
        myLogging.warning('update bookType exception: ' + str(e))
    row = dictCsor.fetchoneDict()
    csor.close()
    conn.close()
    return row
def parseBook(allowUpdate, bookObj, zid):
    """Normalize a zhuishushenqi book payload in place into our schema.

    Maps upstream keys (cover, majorCate/minorCate, wordCount, chaptersCount,
    longIntro, latelyFollower, isSerial) onto the columns used by the book
    table, resolving category/type codes via getCategoryAndTypeCode.

    :param allowUpdate: unused here; kept for call-site compatibility
    :param bookObj: upstream dict, mutated and returned
    :param zid: upstream id, used only for logging
    :return: the enriched bookObj, or None when the book is too short

    Fixes vs. original: `has_key` (Py2-only) replaced with the `in`
    operator; dead commented-out exploration code removed. Behavior
    is unchanged.
    """
    zssqStaticUrl = 'http://statics.zhuishushenqi.com/'
    bookObj['zid'] = bookObj['_id']
    bookObj['imgUrl'] = urlparse.urljoin(zssqStaticUrl, bookObj['cover'])
    # Default both classification levels to "other" when upstream omits them.
    bookObj['category'] = '其他'
    if 'majorCate' in bookObj:
        bookObj['category'] = bookObj['majorCate']
    bookObj['type'] = '其他'
    if 'minorCate' in bookObj:
        bookObj['type'] = bookObj['minorCate']
    bookObj['typeCode'] = 0
    bookObj['categoryCode'], bookObj['typeCode'], bookObj['category'] = \
        getCategoryAndTypeCode(bookObj['category'], bookObj['type'])
    bookObj['size'] = bookObj['wordCount']
    bookObj['chapterNum'] = bookObj['chaptersCount']
    if bookObj['chapterNum'] < MINCHAPNUM:
        myLogging.warning('chapNum too small, skip %s, return', str(zid))
        return None
    bookObj['subtitle'] = bookObj['longIntro']
    # Inflate follower count into a synthetic view counter.
    bookObj['viewNum'] = int(bookObj['latelyFollower']) * 9
    if bookObj['isSerial']:
        bookObj['bookType'] = '连载'
    else:
        bookObj['bookType'] = '完结'
    return bookObj
def getCountDuring(timeStart, timeEnd):
    """Count books whose updateTime lies strictly between the two bounds.

    :param timeStart: exclusive lower bound (epoch seconds)
    :param timeEnd: exclusive upper bound (epoch seconds)
    :return: row count
    """
    conn, csor = getDushuConnCsor()
    sql = ("select count(*) from " + db_dushu +
           " where updateTime > %s and updateTime < %s")
    try:
        csor.execute(sql, (timeStart, timeEnd))
        conn.commit()
    except Exception as e:
        myLogging.warning('update bookType exception: ' + str(e))
    cnt = csor.fetchone()[0]
    csor.close()
    conn.close()
    return cnt
def getInvestListByNameId(quid, qCname):
    """Crawl qichacha's investment tab for a company and recurse into each
    investee, recording results via insertWithUid / insertInvestList.

    :param quid: qichacha unique id of the company
    :param qCname: company name (unicode), urlencoded into the request
    :return: list of {'uid', 'comName'} dicts, or None when already crawled
    """
    # Session cookie is hard-coded — presumably a captured login session;
    # will expire. TODO confirm how it is refreshed.
    cookies = {'PHPSESSID': '5dplss3psrev57ad4jk637jph4'}
    # Bloom filter guards against re-crawling (and recursion cycles).
    if quid in investBloom:
        myLogging.warning('invest aready done before, uid: %s', quid)
        return None
    url = 'http://www.qichacha.com/company_getinfos?unique=' + quid + \
          '&companyname=' + quote(qCname.encode('utf-8')) + '&tab=touzi'
    # url = 'http://www.qichacha.com/company_touzi?unique=' + quid + '&companyname=' + quote(qCname.encode('utf-8')) + '&tab=touzi'
    resList = []
    # NOTE(review): this loop body always reaches the `return resList` at the
    # bottom, so `while 1` never iterates twice — looks like leftover paging
    # scaffolding; confirm whether pagination was intended.
    while 1:
        htmlContent = getQichachaHtml(url, cookies=cookies)
        soup = getSoupByStrEncode(htmlContent)
        for uidTag in soup.select_one('.list-group-item'):
            # href looks like 'firm_<prv>_<uid>.shtml' or 'firm_<uid>.shtml'.
            uid = uidTag['href'].replace('firm_', '').replace('.shtml',
                                                              '').replace('/', '')
            prv = None
            if '_' in uid:
                strs = uid.split('_')
                prv = strs[0]
                uid = strs[1]
            comName = uidTag.select_one('.text-lg').get_text()
            comObj = dict()
            comObj['uid'] = uid
            comObj['comName'] = comName
            # NOTE(review): passes `quid` (the parent) here, while the sibling
            # searchAndCrawlByName passes the child `uid` — possible bug;
            # confirm against insertWithUid's contract.
            insertWithUid(conn, csor, prv, quid)
            getInvestListByNameId(uid, comName)  # recurse into the investee
            resList.append(comObj)
        if len(resList) < 1:
            # No investment records: store an empty marker so we don't retry.
            insertInvestList(quid, '')
        return resList
def updateBoostWithUpdateTime(dbid):
    """Copy a book's updateTime into its typeBoost column (used as a
    recency-boost for ranking).

    :param dbid: book primary key
    :return: None

    Fixes vs. original: a leftover `fetchoneDict()` was called after the
    UPDATE (an UPDATE produces no result set) and its meaningless result
    returned; both removed. No caller in this file uses the return value.
    """
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    try:
        dictCsor.execute(
            "update " + db_dushu + " set typeBoost = updateTime where id = %s",
            (dbid, ))
        conn.commit()
    except Exception as e:
        myLogging.warning('update typeBoost exception: ' + str(e))
    dictCsor.close()
    csor.close()
    conn.close()
def searchAndCrawlByName(comName, proxy=None):
    """Search qichacha for a company name and insert every firm uid found on
    the result page.

    :param comName: company name (unicode); None/empty aborts
    :param proxy: unused here — presumably consumed by a different code path;
                  TODO confirm
    :return: 'ok' on success, None when the page could not be parsed
    """
    if not comName:
        return None
    comName = comName.encode('utf-8')
    # baseUrl = 'http://www.qichacha.com/search?key=' + quote(comName)
    # baseUrl = 'http://www.qichacha.com/firm_CN_ea3a783f0c010fc31a2d75c2c9aa9b75'
    # NOTE(review): baseUrl is hard-coded to the urlencoded query "小米",
    # so `comName` is encoded but never used for the search. Looks like
    # leftover debugging — confirm whether the commented quote(comName)
    # version should be restored.
    baseUrl = 'http://www.qichacha.com/search?key=%E5%B0%8F%E7%B1%B3'
    ua = random.choice(USER_AGENTS)  # NOTE(review): unused below
    htmlContent = getQichachaHtml(baseUrl, noCookie=True)
    if not htmlContent:
        return None
    soup = getSoupByStrEncode(htmlContent)
    if not soup.select('ul.list-group a') or len(
            soup.select('ul.list-group a')) < 1:
        myLogging.debug(htmlContent)
        return None
    for uidTag in soup.select('ul.list-group a'):
        # Only anchors of the form 'firm_...' carry a company uid.
        uid = uidTag['href'].replace('firm_', '')
        if uid == uidTag['href']:
            myLogging.warning('not uid, skip %s', uidTag['href'])
            continue
        uid = uid.replace('.shtml', '').replace('/', '')
        prv = None
        # 'prv_uid' form: split off the province prefix.
        if '_' in uid:
            strs = uid.split('_')
            prv = strs[0]
            uid = strs[1]
        # comName = uidTag.select_one('.text-lg').get_text()
        # comObj = dict()
        # comObj['uid'] = uid
        # comObj['comName'] = comName
        try:
            # Uses the module-level conn/csor pair.
            insertWithUid(conn, csor, prv, uid)
        except Exception as e:
            myLogging.error('insert with uid fail, uid: %s', uid)
    return 'ok'
def getExistsCapsRawUrlId(bookId):
    """Return (id, rawUrl) tuples for every existing chapter of a book.

    :param bookId: book primary key
    :return: tuple of rows, or None when the book has no chapters or the
             query failed

    Fixes vs. original: bookId was interpolated with %d into the SQL string —
    now bound as a query parameter; the cursor/connection leaked on every
    successful return because the close calls sat after an in-`try` return —
    now closed in a `finally` block.
    """
    conn, csor = getDushuConnCsor()
    try:
        csor.execute(
            'select id,rawUrl from cn_dushu_acticle where bookId = %s',
            (bookId, ))
        conn.commit()
        results = csor.fetchall()
        if not results or len(results) < 1:
            myLogging.warning('no caps,, bookId:' + str(bookId))
            return None
        return results
    except Exception as e:
        myLogging.error(e)
    finally:
        csor.close()
        conn.close()
def handleWebsiteNoise(begin, end):
    """Strip embedded website watermarks (www...com variants) from chapter
    content of book 960 within an id range, rewriting each chapter.

    :param begin: exclusive lower chapter-id bound
    :param end: exclusive upper chapter-id bound

    Fixes vs. original: begin/end were string-concatenated into the SQL —
    now bound as query parameters.
    """
    conn2, csor2 = getDushuConnCsor()
    sql = ('select id,content from cn_dushu_acticle '
           'where bookId = 960 and id > %s and id < %s')
    try:
        csor2.execute(sql, (begin, end))
        conn2.commit()
    except Exception as e:
        myLogging.warning(e)
    res = csor2.fetchall()
    for cap in res:
        id = cap[0]
        content = cap[1]
        # Lower-case once, then remove the common watermark spellings
        # (including the cyrillic-м obfuscation).
        content = re.sub(u'www.{0,15}com', "", content.lower())
        content = re.sub(u'wwww.{0,15}c.{1,2}м', "", content)
        updateContentById(id, content)
    csor2.close()
    conn2.close()
def getChapObjByBookIdChapTitle(bookId, title):
    """Fetch one chapter row matched by book id and chapter title.

    :param bookId: book primary key
    :param title: exact chapter title
    :return: dict row, or None when no chapter matches
    """
    conn, csor = getDushuConnCsor()
    dictCsor = conn.cursor(MySQLdb.cursors.DictCursor)
    sql = ('select * from ' + db_acticle +
           " where bookId = %s and title = %s")
    try:
        dictCsor.execute(sql, (bookId, title))
        conn.commit()
    except Exception as e:
        myLogging.warning(e)
    chap = dictCsor.fetchoneDict()
    csor.close()
    conn.close()
    return chap
def updateByBookObj(bookObj):
    """Check an upstream API for new chapters of a book and crawl any that we
    do not yet have, then sync chapterNum/bookType in the DB.

    :param bookObj: DB row dict with 'id', 'source', 'chapterNum', 'title'
    :return: None (all effects are DB updates and log lines)

    NOTE(review): this shadows the earlier `updateByBookObj` (shuqi variant)
    defined above in this file — confirm which definition should win.
    """
    latestChapObj = getLatestChapByBookId(bookObj['id'])
    chapName = ''
    chapIdx = 0
    if latestChapObj:
        chapName = latestChapObj['title']
        chapIdx = latestChapObj['idx']
    source = bookObj['source']
    checkUpdateUrl = checkUpdateBaseUrl % source
    # The API expects the client's current reading position; we send our
    # newest stored chapter so it replies only with genuinely new items.
    payload = {
        'client_chapter_name': chapName.encode('utf-8'),
        'client_bookmark_name': chapName.encode('utf-8'),
        'client_chapter_count': int(chapIdx),
        'client_bookmark_count': int(chapIdx)
    }
    headers = {
        u'User-Agent': 'Dalvik/2.1.0 (Linux; U; Android 5.1; M3s Build/LMY47I)',
        u'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
    }
    r = requests.post(checkUpdateUrl, data=payload, headers=headers)
    resp = r.text
    respJson = json.loads(resp)
    if not respJson['items'] or len(
            respJson['items']) < 1 or respJson['total'] < 1:
        myLogging.info('%s no update, skip', bookObj['id'])
        return
    resIdx = chapIdx  # highest chapter index successfully stored
    chapTitles = getChapTitlesByBookId(bookObj['id'])
    chapIdxs = getCapIdxsByBookId(bookObj['id'])
    for chapObj in respJson['items']:
        # if chapObj['serial_number'] <= chapIdx:
        tempIdx = chapObj['serial_number']
        tempTitle = chapObj['name']
        # Skip only when index AND title are both already known — guards
        # against upstream renumbering.
        if chapObj[
                'serial_number'] <= chapIdx and tempIdx in chapIdxs and tempTitle in chapTitles:
            continue
        try:
            rewChapIdx = handlChapByBookObjChapObj(chapObj=chapObj,
                                                   bookObj=bookObj,
                                                   allowUpdate=True)
            resIdx = max(resIdx, rewChapIdx)
        except Exception as e:
            # One bad chapter must not abort the rest of the batch.
            myLogging.error('bookId %s chap idx %s has exception: %s',
                            bookObj['id'], chapObj['serial_number'],
                            traceback.format_exc())
    if resIdx > bookObj['chapterNum']:
        updateOneFieldByOneField('chapterNum', resIdx, 'id', bookObj['id'])
        updateBoostWithUpdateTime(bookObj['id'])
        myLogging.info(
            str(bookObj['id']) + respJson['book']['name'].encode('utf-8') +
            ' update ' + str(resIdx - bookObj['chapterNum']) + ' chaps ')
        # NOTE(review): if status is neither 'serialize' nor 'FINISH',
        # `newStatus` is unbound here and the next line raises NameError —
        # confirm the API's status vocabulary.
        if u'serialize' == respJson['book']['status']:
            newStatus = u'连载'
        if u'FINISH' == respJson['book']['status']:
            newStatus = u'完结'
        updateOneFieldByOneField('bookType', newStatus, 'id', bookObj['id'])
        myLogging.warning(bookObj['title'].encode('utf-8') +
                          newStatus.encode('utf-8'))
    else:
        myLogging.info(
            str(bookObj['id']) +
            ' has unexcepted, please check. didnot update ')
def crawlCurrentBookObj(mid):
    """Fetch one book's metadata from the MianFeiTXT API and map it into our
    book-table schema.

    :param mid: upstream numeric book id
    :return: (bookObj dict, chapter count), or (None, None) when the book is
             too short to keep
    """
    # url = MianFeiTXTBaseUrl + str(mid)
    url = MianFeiTXTBookBaseUrl + '?' + paramMap().mianfeiTXT().mBookId(
        mid).mianfeiTXTSign().toUrl()
    baseInfoContent = getContentWithUA(url, ua)
    if not baseInfoContent:
        # Single retry on an empty response.
        baseInfoContent = getContentWithUA(url, ua)
    baseObj = json.loads(baseInfoContent)
    baseData = baseObj['data']['book']
    author = baseData['author']
    title = baseData['name']
    coverUrl = baseData['coverUrl']
    # contentUrl = baseData['contentUrl']
    # Upstream chapter count lags behind reality — treat as approximate.
    count = baseData['latestChapterCount']
    if count < MINCHAPNUM:
        myLogging.warning('chapNum too small, skip %s, return', str(mid))
        return None, None
    # isOver = baseData['isOver']
    BookType = baseData['serialStatus']
    # if isOver == 1:
    #     BookType = '完结'
    # (detail-page scraping for description/labels left disabled below)
    # bookDetailHtml = getContentWithUA(MianFeiTXTBookDetailUrl + str(mid), ua)
    # bookDetailSoup = getSoupByStr(bookDetailHtml)
    # bookDesc = bookDetailSoup.select_one('#J-desc').get_text().replace('\n', '').replace('\t\t', '\t')
    # bookLabels = []
    # for span in bookDetailSoup.select('#J-lables-items span'):
    #     bookLabels.append(span.get_text())
    bookObj = dict()
    bookObj['subtitle'] = baseData['summary']
    bookObj['source'] = "" + str(mid)
    bookObj['rawUrl'] = MianFeiTXTBaseUrl + str(mid)
    bookObj['title'] = title
    bookObj['chapterNum'] = count  # approximate; see note above
    bookObj['imgUrl'] = 'http://oss-public.antehao.cn/' + coverUrl
    bookObj['author'] = author
    bookObj['size'] = baseData['words']
    bookObj['category'] = baseData['secondCategory']
    # if len(bookLabels) > 0:
    #     bookObj['category'] = bookLabels[0]
    bookObj['type'] = baseData['thirdCategory']
    # if len(bookLabels) > 0:
    #     bookObj['type'] = bookLabels[0]
    # if len(bookLabels) > 1:
    #     bookObj['type'] = bookLabels[1]
    bookObj['bookType'] = BookType
    bookObj['categoryCode'], bookObj['typeCode'], bookObj['category'] = \
        getCategoryAndTypeCode(bookObj['category'], bookObj['type'])
    # bookObj['typeCode'] = 0
    # bookObj['categoryCode'] = 0
    # Synthetic popularity figure.
    bookObj['viewNum'] = random.randint(500000, 1000000)
    # Latest chapter index is a secondary update-detection signal; capped at
    # 200000 — presumably to bound bogus upstream ids. TODO confirm.
    bookObj['latestCapIndex'] = min(baseData['latestChapterId'], 200000)
    # try:
    #     capExamples = bookDetailSoup.select('.J-category-li')
    #     if capExamples and len(capExamples) > 2:
    #         bookObj['latestCapIndex'] = int(capExamples[2]['id'])  # take the third; pages carry 3 or 6
    # except Exception:
    #     myLogging.warning(traceback.format_exc())
    return bookObj, count