def main(coreNum, itemsNum): mCWCrawler = CWKCrawler() url = 'http://wk.baidu.com' sql = 'update resources set content="%s",flag=2 where resource_id=%d' db = connectDb() while True: print 'Read resources from database.' resources = readResourceHrefs(db, itemsNum) if not resources: fp = open('flag.txt', 'r') flag = cPickle.load(fp) fp.close() if flag == 0: break else: print 'Sleep: ', time.localtime() db.CloseDb() time.sleep(600) print 'Wake up:', time.localtime() db = connectDb() continue pool = Pool(processes=coreNum) results = {} for i in range(len(resources)): results[i] = pool.apply_async(mCWCrawler.getContentBS, (url+resources[i]['href'], )) pool.close() pool.join() for i in results: print resources[i]['resource_id'] wkPage = results[i].get() if not wkPage: continue db.UpdateTb(sql %(wkPage['content'], resources[i]['resource_id'])) db.CloseDb()
def main(itemsNum): db = connectDb() mCWebRender = CWebRender() mCWenkuPage = CWenkuPage() url = 'http://wenku.baidu.com' insertSQL = 'insert into comment(user_id, resource_id, score, time) values(%s, %s, %s, %s)' while True: resources = getResources(db, itemsNum) if not resources: fp = open('flag.txt', 'r') flag = cPickle.load(fp) fp.close() if flag == 0: break else: print 'Sleep: ', time.localtime() db.CloseDb() time.sleep(600) print 'Wake up:', time.localtime() db = connectDb() continue for resource in resources: print resource['resource_id'] flag = mCWenkuPage.getPage(url+resource['href'], mCWebRender) if not flag: continue users = mCWenkuPage.getAllComment(mCWebRender) updateSQL = 'update resources set flag=4 where resource_id=%d' % resource['resource_id'] db.UpdateTb(updateSQL) values = [(user[0], resource['resource_id'], user[1], user[2]) for user in users] db.InsertTb(insertSQL, values) mCWebRender.closeUrl() db.CloseDb()
def main(coreNum, itemsNum): mCWCrawler = CWKCrawler() url = 'http://wk.baidu.com' sql = 'update resources set content="%s",flag=2 where resource_id=%d' db = connectDb() while True: print 'Read resources from database.' resources = readResourceHrefs(db, itemsNum) if not resources: fp = open('flag.txt', 'r') flag = cPickle.load(fp) fp.close() if flag == 0: break else: print 'Sleep: ', time.localtime() db.CloseDb() time.sleep(600) print 'Wake up:', time.localtime() db = connectDb() continue pool = Pool(processes=coreNum) results = {} for i in range(len(resources)): results[i] = pool.apply_async(mCWCrawler.getContentBS, (url + resources[i]['href'], )) pool.close() pool.join() for i in results: print resources[i]['resource_id'] wkPage = results[i].get() if not wkPage: continue db.UpdateTb(sql % (wkPage['content'], resources[i]['resource_id'])) db.CloseDb()
def main(coreNum, itemsNum): db = connectDb() flag = 1 fp = open('flag.txt', 'w') cPickle.dump(flag, fp) fp.close() while True: lessons = getLessons(db, itemsNum) if not lessons: break InsertSQL = 'insert into resources(lesson_id, title, href, type) values(%s, %s, %s, %s)' UpdateSQL = 'update lessons set flag = %d where lesson_id=%d' for i in range(len(lessons)): print 'Lesson ID: ', lessons[0]['book_id'] sql = 'select href from books where book_id=%d' % lessons[0][ 'book_id'] books = db.InquiryTb(sql) url = 'http://wenku.baidu.com' + books[0]['href'] + lessons[0][ 'href'] values = readHrefLesson(lessons[i]['lesson_id'], url) db.InsertTb(InsertSQL, values) db.UpdateTb(UpdateSQL % (1, lessons[0]['lesson_id'])) flag = 0 fp = open('flag.txt', 'w') cPickle.dump(flag, fp) fp.close() db.CloseDb()
def main(coreNum):
    """Run ``poolFunc`` in large parallel rounds until one call reports failure.

    Each round calls ``poolFunc(db)`` once in this process, then submits it
    10000 times to a ``Pool`` of ``coreNum`` workers. The loop ends when the
    in-process call returned ``0`` or any pooled call returned a falsy value.

    Args:
        coreNum: Number of worker processes for each pool.
    """
    # NOTE(review): the flattened source is ambiguous about whether this
    # assignment was live code or part of a commented-out block; it is
    # treated as live because db is used and closed below - confirm.
    db = connectDb()
    while True:
        flag = poolFunc(db)

        pool = Pool(processes=coreNum)
        pending = [pool.apply_async(poolFunc, args=(db,))
                   for _ in xrange(10000)]
        pool.close()
        pool.join()

        # Results are read in submission order, stopping at the first falsy one.
        all_ok = all(task.get() for task in pending)
        if flag == 0 or not all_ok:
            break
    db.CloseDb()
def main(coreNum):
    """Run ``poolFunc`` in large parallel rounds until one call reports failure.

    Each round calls ``poolFunc(db)`` once in this process, then submits it
    10000 times to a ``Pool`` of ``coreNum`` workers. The loop ends when the
    in-process call returned ``0`` or any pooled call returned a falsy value.

    Args:
        coreNum: Number of worker processes for each pool.
    """
    # NOTE(review): the flattened source is ambiguous about whether this
    # assignment was live code or part of a commented-out block; it is
    # treated as live because db is used and closed below - confirm.
    db = connectDb()
    while True:
        flag = poolFunc(db)

        pool = Pool(processes=coreNum)
        pending = [pool.apply_async(poolFunc, args=(db,))
                   for _ in xrange(10000)]
        pool.close()
        pool.join()

        # Results are read in submission order, stopping at the first falsy one.
        all_ok = all(task.get() for task in pending)
        if flag == 0 or not all_ok:
            break
    db.CloseDb()
def test(itemsNum):
    """Single-process variant of the content crawl.

    Reads batches of resources with ``readResourceHrefs`` until a batch is
    empty; for each resource it fetches the page through
    ``CWKCrawler.getContentBS`` and writes the content back to the
    ``resources`` table with ``flag=2``.

    Args:
        itemsNum: Passed through to ``readResourceHrefs`` for each batch.
    """
    crawler = CWKCrawler()
    base_url = 'http://wk.baidu.com'
    db = connectDb()
    update_sql = 'update resources set content="%s",flag=2 where resource_id=%d'

    batch = readResourceHrefs(db, itemsNum)
    while batch:
        for resource in batch:
            # No guard on the fetch result here: a failed fetch raises below.
            wkPage = crawler.getContentBS(base_url + resource['href'])
            statement = update_sql % (wkPage['content'],
                                      resource['resource_id'])
            db.UpdateTb(statement)
        batch = readResourceHrefs(db, itemsNum)
    db.CloseDb()
def test(itemsNum):
    """Single-process variant of the content crawl.

    Reads batches of resources with ``readResourceHrefs`` until a batch is
    empty; for each resource it fetches the page through
    ``CWKCrawler.getContentBS`` and writes the content back to the
    ``resources`` table with ``flag=2``.

    Args:
        itemsNum: Passed through to ``readResourceHrefs`` for each batch.
    """
    crawler = CWKCrawler()
    base_url = 'http://wk.baidu.com'
    db = connectDb()
    update_sql = 'update resources set content="%s",flag=2 where resource_id=%d'

    batch = readResourceHrefs(db, itemsNum)
    while batch:
        for resource in batch:
            # No guard on the fetch result here: a failed fetch raises below.
            wkPage = crawler.getContentBS(base_url + resource['href'])
            statement = update_sql % (wkPage['content'],
                                      resource['resource_id'])
            db.UpdateTb(statement)
        batch = readResourceHrefs(db, itemsNum)
    db.CloseDb()
def main(): #url = 'http://wenku.baidu.com/portal/subject/8_s0_g0_v0' #小学 #url = 'http://wenku.baidu.com/portal/subject/9_s0_g0_v0' #初中 url = 'http://wenku.baidu.com/portal/subject/31_s0_g0_v0' #高中 soup = getSoup(url) bookSoups = getBookSoups(soup) if not bookSoups: return False subjectSoup = bookSoups[0] subjectHrefs = getSoupHrefs(subjectSoup) db = connectDb() bookSQL = 'insert into books(subject, version, grade, href) values(%s, %s, %s, %s)' lessonSQL = 'insert into lessons(book_id, title, href, flag) values(%s, %s, %s, %s)' url = 'http://wenku.baidu.com' for subject_href in subjectHrefs: soup = getSoup(url + subject_href) bookSoups = getBookSoups(soup) if not bookSoups: continue versionHrefs = getSoupHrefs(bookSoups[1]) for version_href in versionHrefs: soup = getSoup(url + version_href) bookSoups = getBookSoups(soup) if not bookSoups: continue gradeHrefs = getSoupHrefs(bookSoups[2]) for grade_href in gradeHrefs: book_href = url + grade_href soup = getSoup(book_href) book_name = getSelectBook(soup) lessons = getLessons(soup) print '#' * 100 if not book_name: continue book_id = db.InsertTb( bookSQL, [(book_name[0], book_name[1], book_name[2], grade_href)]) print book_href, '_'.join(book_name), len(lessons) values = [] for title, href in lessons: print ' ' * 3, title, book_href + href values.append((book_id, title, href, 0)) db.InsertTb(lessonSQL, values) db.CloseDb() print 'Over!'