def main(coreNum, itemsNum):
    mCWCrawler = CWKCrawler()
    url = 'http://wk.baidu.com'
    sql = 'update resources set content="%s",flag=2 where resource_id=%d'
    db = connectDb()
    while True:
        print 'Read resources from database.'
        resources = readResourceHrefs(db, itemsNum)
        if not resources:
            fp = open('flag.txt', 'r')
            flag = cPickle.load(fp)
            fp.close()
            if flag == 0:
                break
            else:
                print 'Sleep: ', time.localtime()
                db.CloseDb()
                time.sleep(600)
                print 'Wake up:', time.localtime()
                db = connectDb()
                continue
        pool = Pool(processes=coreNum)
        results = {}
        for i in range(len(resources)):
            results[i] = pool.apply_async(mCWCrawler.getContentBS, (url+resources[i]['href'], ))
        pool.close()
        pool.join()

        for i in results:
            print resources[i]['resource_id']
            wkPage = results[i].get()
            if not wkPage: continue
            db.UpdateTb(sql %(wkPage['content'], resources[i]['resource_id']))
    db.CloseDb()
示例#2
0
def main(itemsNum):
    db = connectDb()
    mCWebRender = CWebRender()
    mCWenkuPage = CWenkuPage()
    url = 'http://wenku.baidu.com'
    insertSQL = 'insert into comment(user_id, resource_id, score, time) values(%s, %s, %s, %s)'
    while True:
        resources = getResources(db, itemsNum)
        if not resources:
            fp = open('flag.txt', 'r')
            flag = cPickle.load(fp)
            fp.close()

            if flag == 0:
                break
            else:
                print 'Sleep: ', time.localtime()
                db.CloseDb()
                time.sleep(600)
                print 'Wake up:', time.localtime()
                db = connectDb()
                continue
        for resource in resources:
            print resource['resource_id']
            flag = mCWenkuPage.getPage(url+resource['href'], mCWebRender)
            if not flag:
                continue
            users = mCWenkuPage.getAllComment(mCWebRender)

            updateSQL = 'update resources set flag=4 where resource_id=%d' % resource['resource_id']
            db.UpdateTb(updateSQL)
            values = [(user[0], resource['resource_id'], user[1], user[2]) for user in users]
            db.InsertTb(insertSQL, values)
    mCWebRender.closeUrl()
    db.CloseDb()
def main(coreNum, itemsNum):
    mCWCrawler = CWKCrawler()
    url = 'http://wk.baidu.com'
    sql = 'update resources set content="%s",flag=2 where resource_id=%d'
    db = connectDb()
    while True:
        print 'Read resources from database.'
        resources = readResourceHrefs(db, itemsNum)
        if not resources:
            fp = open('flag.txt', 'r')
            flag = cPickle.load(fp)
            fp.close()
            if flag == 0:
                break
            else:
                print 'Sleep: ', time.localtime()
                db.CloseDb()
                time.sleep(600)
                print 'Wake up:', time.localtime()
                db = connectDb()
                continue
        pool = Pool(processes=coreNum)
        results = {}
        for i in range(len(resources)):
            results[i] = pool.apply_async(mCWCrawler.getContentBS,
                                          (url + resources[i]['href'], ))
        pool.close()
        pool.join()

        for i in results:
            print resources[i]['resource_id']
            wkPage = results[i].get()
            if not wkPage: continue
            db.UpdateTb(sql % (wkPage['content'], resources[i]['resource_id']))
    db.CloseDb()
def main(coreNum, itemsNum):
    db = connectDb()
    flag = 1
    fp = open('flag.txt', 'w')
    cPickle.dump(flag, fp)
    fp.close()
    while True:
        lessons = getLessons(db, itemsNum)
        if not lessons: break

        InsertSQL = 'insert into resources(lesson_id, title, href, type)  values(%s, %s, %s, %s)'
        UpdateSQL = 'update lessons set flag = %d where lesson_id=%d'
        for i in range(len(lessons)):
            print 'Lesson ID: ', lessons[0]['book_id']
            sql = 'select href from books where book_id=%d' % lessons[0][
                'book_id']
            books = db.InquiryTb(sql)
            url = 'http://wenku.baidu.com' + books[0]['href'] + lessons[0][
                'href']
            values = readHrefLesson(lessons[i]['lesson_id'], url)
            db.InsertTb(InsertSQL, values)
            db.UpdateTb(UpdateSQL % (1, lessons[0]['lesson_id']))
    flag = 0
    fp = open('flag.txt', 'w')
    cPickle.dump(flag, fp)
    fp.close()
    db.CloseDb()
示例#5
0
def main(coreNum):
    """Run ``poolFunc`` in rounds until a round reports a falsy result.

    Every round calls ``poolFunc(db)`` once directly, then submits 10000
    more ``poolFunc(db)`` tasks to ``Pool(processes=coreNum)`` and waits for
    them. The loop ends when the direct call returned a value equal to 0 or
    when any pooled task returned a falsy value; the connection is closed
    afterwards.

    Args:
        coreNum: process count passed to ``Pool``.
    """
    db = connectDb()
    while True:
        flag = poolFunc(db)

        workers = Pool(processes=coreNum)
        pending = dict(
            (idx, workers.apply_async(poolFunc, args=(db,)))
            for idx in xrange(10000))
        workers.close()
        workers.join()

        # all() stops at the first falsy task result, like the original
        # early-exit scan.
        if not all(pending[idx].get() for idx in pending):
            flag = 0
        if flag == 0:
            break

    db.CloseDb()
示例#6
0
def main(coreNum):
    """Repeat batches of ``poolFunc`` calls until one signals completion.

    One batch consists of a direct ``poolFunc(db)`` call followed by 10000
    asynchronous ``poolFunc(db)`` calls on a ``coreNum``-process pool. After
    the pool has joined, the batch results are scanned; a falsy result (or a
    direct-call result equal to 0) ends the outer loop. The database
    connection is closed before returning.

    Args:
        coreNum: number of processes given to ``Pool``.
    """
    db = connectDb()
    keepRunning = True
    while keepRunning:
        flag = poolFunc(db)

        pool = Pool(processes=coreNum)
        results = {}
        for taskNo in xrange(10000):
            results[taskNo] = pool.apply_async(poolFunc, args=(db, ))
        pool.close()
        pool.join()

        # The first falsy task result forces the stop flag.
        for taskNo in results:
            if not results[taskNo].get():
                flag = 0
                break
        keepRunning = not (flag == 0)

    db.CloseDb()
def test(itemsNum):
    """Sequentially fetch resource pages and store their content.

    Reads batches with ``readResourceHrefs`` until one comes back empty. Each
    resource is fetched with ``CWKCrawler.getContentBS`` in the calling
    process and, when the fetch yields a page, its row is updated with the
    content and ``flag=2``.

    Args:
        itemsNum: forwarded to ``readResourceHrefs`` (presumably the batch
            size -- confirm against that helper).
    """
    mCWCrawler = CWKCrawler()
    url = 'http://wk.baidu.com'
    db = connectDb()
    # NOTE(review): content is interpolated into the SQL text; a double
    # quote inside a page breaks the statement.
    sql = 'update resources set content="%s",flag=2 where resource_id=%d'
    while True:
        resources = readResourceHrefs(db, itemsNum)
        if not resources:
            break
        for resource in resources:
            wkPage = mCWCrawler.getContentBS(url + resource['href'])
            # getContentBS can return a falsy value (the pooled main() skips
            # those); indexing it would raise, so skip here as well.
            # NOTE(review): a skipped row is not updated -- confirm that
            # readResourceHrefs does not hand it out again indefinitely.
            if not wkPage:
                continue
            db.UpdateTb(sql % (wkPage['content'], resource['resource_id']))
    db.CloseDb()
def test(itemsNum):
    """Single-process crawl: fetch each resource page and save its content.

    Keeps reading batches through ``readResourceHrefs`` until an empty one is
    returned. Every resource in a batch is fetched with
    ``CWKCrawler.getContentBS``; a truthy page is written back to
    ``resources`` together with ``flag=2``.

    Args:
        itemsNum: passed through to ``readResourceHrefs`` (likely a batch
            size -- verify against that helper).
    """
    mCWCrawler = CWKCrawler()
    url = 'http://wk.baidu.com'
    db = connectDb()
    # NOTE(review): the page content is formatted straight into the SQL
    # string, so content containing a double quote yields invalid SQL.
    sql = 'update resources set content="%s",flag=2 where resource_id=%d'
    while True:
        resources = readResourceHrefs(db, itemsNum)
        if not resources:
            break
        for item in resources:
            wkPage = mCWCrawler.getContentBS(url + item['href'])
            if wkPage:
                db.UpdateTb(sql % (wkPage['content'], item['resource_id']))
            # A falsy page is skipped rather than indexed, mirroring the
            # check the pooled main() applies to the same helper.
            # NOTE(review): confirm readResourceHrefs will not keep
            # returning rows that were skipped here.
    db.CloseDb()
示例#9
0
def main():
    #url = 'http://wenku.baidu.com/portal/subject/8_s0_g0_v0'   #小学
    #url = 'http://wenku.baidu.com/portal/subject/9_s0_g0_v0'    #初中
    url = 'http://wenku.baidu.com/portal/subject/31_s0_g0_v0'  #高中
    soup = getSoup(url)
    bookSoups = getBookSoups(soup)
    if not bookSoups:
        return False
    subjectSoup = bookSoups[0]
    subjectHrefs = getSoupHrefs(subjectSoup)
    db = connectDb()
    bookSQL = 'insert into books(subject, version, grade, href) values(%s, %s, %s, %s)'
    lessonSQL = 'insert into lessons(book_id, title, href, flag) values(%s, %s, %s, %s)'
    url = 'http://wenku.baidu.com'
    for subject_href in subjectHrefs:
        soup = getSoup(url + subject_href)
        bookSoups = getBookSoups(soup)
        if not bookSoups: continue
        versionHrefs = getSoupHrefs(bookSoups[1])
        for version_href in versionHrefs:
            soup = getSoup(url + version_href)
            bookSoups = getBookSoups(soup)
            if not bookSoups: continue
            gradeHrefs = getSoupHrefs(bookSoups[2])
            for grade_href in gradeHrefs:
                book_href = url + grade_href
                soup = getSoup(book_href)
                book_name = getSelectBook(soup)
                lessons = getLessons(soup)
                print '#' * 100
                if not book_name: continue
                book_id = db.InsertTb(
                    bookSQL,
                    [(book_name[0], book_name[1], book_name[2], grade_href)])
                print book_href, '_'.join(book_name), len(lessons)
                values = []
                for title, href in lessons:
                    print '  ' * 3, title, book_href + href
                    values.append((book_id, title, href, 0))
                db.InsertTb(lessonSQL, values)
    db.CloseDb()
    print 'Over!'