示例#1
0
def crawl_avnum(genreid, avnum_queue, expt_queue, proc_num, max_retries=3):
    """Walk every listing page of a genre and queue the movies not yet stored.

    The first listing page is fetched to read the total movie count, from
    which the number of pages is derived (30 movies per page). Each entry
    returned by ``get_movie_list`` whose av number is not already in the
    database is put on ``avnum_queue`` unchanged.

    Args:
        genreid: Genre identifier used to build the listing URLs.
        avnum_queue: Queue receiving the movie entries that still need
            crawling.
        expt_queue: Queue receiving every ``AvExpt`` raised while fetching
            a listing page after the first one.
        proc_num: Worker number, forwarded to ``get_list_page_soup``.
        max_retries: How many times a failing page (after the first) is
            fetched again before it is skipped. Previously a failing page
            was retried without limit, which could loop forever and flood
            ``expt_queue``.

    Raises:
        AvExpt: If fetching the *first* page fails; that fetch is not
            guarded because the page count cannot be known without it.
    """
    next_page = 1
    max_page = 1
    failures = 0  # consecutive failed fetches of the current page
    while next_page <= max_page:
        url = get_page_url(next_page, genreid)
        if next_page == 1:
            next_page_soup = get_list_page_soup(url, genreid, proc_num)
            # The last whitespace-separated token of the result link's text
            # is parsed as the total number of movies in this genre.
            movie_sum = int(
                next_page_soup.find_all('a', {
                    'class': 'mypointer',
                    'id': 'resultshowmag'
                })[0].get_text().split()[-1])
            max_page = math.ceil(movie_sum / 30)
            print('genre {} 共{}页'.format(genreid, max_page))
        else:
            try:
                next_page_soup = get_list_page_soup(url, genreid, proc_num)
            except AvExpt as e:
                print(e)
                expt_queue.put(e)
                failures += 1
                if failures > max_retries:
                    # Give up on this page so the crawl can make progress.
                    print('genre {} page {} skipped after {} failed '
                          'attempts'.format(genreid, next_page, failures))
                    failures = 0
                    next_page += 1
                continue
            failures = 0

        for movie_entry in get_movie_list(next_page_soup):
            # The first element of an entry is what get_av_num parses.
            av_num = get_av_num(movie_entry[0])

            # Only movies that are not stored yet are queued.
            if not database.check_existence(av_num):
                avnum_queue.put(movie_entry)
        next_page += 1
示例#2
0
# Last two social facts for the first engine, taken from answers 42 and 43.
engine.declare(KnowledgeEngine.SocialFact(futureStrategy=AnswerArrayCP[42]))
engine.declare(KnowledgeEngine.SocialFact(sensibleResolvePossible=AnswerArrayCP[43]))

engine.run()  # Run the first engine on the facts declared so far

# Collect the values for this user in a fixed order: user name, the three
# individual levels, answers 26..43, and finally maxIndValueOfDAS.
InputArray = []
Username = "******"
InputArray.append(Username)
InputArray.append(individualDepressionLevel)
InputArray.append(individualAnxietyLevel)
InputArray.append(individualStressLevel)
for x in range(26, 44):
    InputArray.append(AnswerArrayCP[x])
InputArray.append(maxIndValueOfDAS)
# If this is the first execution, do not compare "old" values with "new" ones.
# NOTE(review): relies on check_existence returning None for an unknown user.
if database.check_existence(Username) is None:
    print("No previous data found")
# Otherwise compare the values by also running Knowledge Engine Two
else:
    # NOTE(review): this empty list is overwritten on the very next line.
    dbOldFactsArray = []
    dbOldFactsArray = database.read_from_db(Username)  # NOTE(review): return type not visible here - confirm it is a list

    # Start the second knowledge engine
    engineTwo = KnowledgeEngineTwo.ComparingOldInputWithNew()
    engineTwo.reset()  # Prepare the engine for the execution.

    # Declare the new facts for engine two from answers 26..29
    engineTwo.declare(KnowledgeEngineTwo.NewFinancialFact(financialDistress=AnswerArrayCP[26]))
    engineTwo.declare(KnowledgeEngineTwo.NewFinancialFact(employment=AnswerArrayCP[27]))
    engineTwo.declare(KnowledgeEngineTwo.NewFamilyFact(isCaretaker=AnswerArrayCP[28]))
    engineTwo.declare(KnowledgeEngineTwo.NewFamilyFact(getsEnoughSupport=AnswerArrayCP[29]))
示例#3
0
def parse_page(url, thread_num, counter):
    """Process one listing page and store every movie not yet in the database.

    Args:
        url: Address of the listing page; its basename is used as the page
            label in progress messages.
        thread_num: Worker number shown in progress messages.
        counter: Object whose ``increment_parse()`` is called once per
            stored movie.

    Returns:
        The value of ``parser.get_next_page_url`` for this listing page.
    """
    listing_soup = parser.get_main_page_soup(url)
    movie_page_links = parser.get_movie_page_list(listing_soup)
    following_page = parser.get_next_page_url(listing_soup)

    for movie_link in movie_page_links:
        av_num = parser.get_av_num(movie_link)

        # Movies that are already stored are reported and skipped.
        if database.check_existence(av_num):
            print('* 已存在 %s 停止爬取 *' % av_num)
            continue

        detail_soup = parser.get_link_soup(movie_link)

        # Progress line: worker, page label and av number.
        page_label = str(os.path.basename(url))
        print('Thread {} 正在扒取:第 {} 页 番号:{}'.format(
            str(thread_num), page_label, av_num))

        # Gather everything from the detail page before writing anything.
        movie = parser.get_movie(detail_soup, av_num)
        cast = parser.get_star_list(detail_soup)
        magnets = parser.get_download_link(detail_soup, url, av_num)
        sample_images = parser.get_sample_img_list(detail_soup)

        # Persist in the order: movie, stars, magnet links, sample images.
        database.insert_movie(movie)
        for star in cast:
            database.insert_star(star, av_num)
        for magnet in magnets:
            database.insert_magnet(magnet)
        for image_url in sample_images:
            database.insert_img(image_url, av_num)

        counter.increment_parse()

    print(''.join(('第 ', str(os.path.basename(url)), ' 页扒取完毕')))
    print('-------------------------')

    return following_page
示例#4
0
 def test_db_check_existence(self):
     # Looking up the name "Anna" is expected to yield None.
     self.assertEqual(None, database.check_existence("Anna"))
0
def _insert_or_report(expt_queue, proc_num, av_url, what, insert, *args):
    """Run one database insert; on failure log it and queue an AvExpt.

    Args:
        expt_queue: Queue receiving the ``AvExpt`` describing a failure.
        proc_num: Worker number used in the log line and the ``AvExpt``.
        av_url: Movie URL recorded in the ``AvExpt``.
        what: Short label of the record kind (e.g. ``'movie'``, ``'m_s'``);
            it appears in the log line and, suffixed with ``'_insert'``, as
            the last ``AvExpt`` argument.
        insert: Callable performing the insert.
        *args: Positional arguments passed to ``insert``.
    """
    try:
        insert(*args)
    except Exception as e:
        # Best effort: a failed insert is reported, never re-raised, so the
        # remaining records of the movie are still attempted.
        print('Process {} facing exception when insert {}:'.format(
            proc_num, what))
        print(e)
        expt_queue.put(
            AvExpt(proc_num, 'database_insert', av_url, str(e),
                   what + '_insert'))


def crawl_movie(av_info, expt_queue, proc_num):
    """Crawl one movie and store it with its stars, genres, links and images.

    Nothing is done when the movie's av number is already in the database.
    An ``AvExpt`` raised while building the movie object or collecting the
    download links is printed, queued, and aborts the movie. An ``AvExpt``
    raised while crawling a single star or genre is printed, queued, and
    skips only that star or genre. Database insert failures are printed and
    queued, and processing continues.

    Args:
        av_info: Indexable entry; ``av_info[0]`` is passed to ``get_av_num``
            and recorded as the URL in error reports, ``av_info[1]`` is
            passed to ``get_movie_class``.
        expt_queue: Queue receiving ``AvExpt`` objects for every failure.
        proc_num: Worker number used in log lines and error reports.
    """
    av_url = av_info[0]
    av_num = get_av_num(av_url)
    # Truthiness instead of ``is True``: any truthy result counts as
    # "already stored", not only the bool singleton.
    if database.check_existence(av_num):
        return
    print('Process {} get avnum: {}'.format(proc_num, av_num))
    soup = get_movie_soup(av_info, proc_num)

    try:
        movie = get_movie_class(soup, av_num, av_info[1])
    except AvExpt as e:
        # NOTE(review): the message says "database inserting" although the
        # failing call is get_movie_class - confirm the intended wording.
        print('Process {} facing exception when database inserting:'.format(
            proc_num))
        print(e)
        expt_queue.put(e)
        return
    print('movie class:', movie)

    star_id_iter = get_star_iter(soup)
    genres = get_genre_iter(soup)

    try:
        link_iter = get_download_iter(soup, av_url, av_num, proc_num)
    except AvExpt as e:
        print(
            'Process {} facing exception when crawling link:'.format(proc_num))
        print(e)
        expt_queue.put(e)
        return

    images = get_sample_img_iter(soup)

    _insert_or_report(expt_queue, proc_num, av_url, 'movie',
                      database.insert_movie, movie)

    # Stars: an unknown star is crawled and stored first, then the
    # movie-star relation is stored in either case.
    for s in star_id_iter:
        if not database.check_stars(s[0]):
            try:
                star = get_star(s, proc_num)
            except AvExpt as e:
                print('Process {} facing exception when crawling star:'.format(
                    proc_num))
                print(e)
                expt_queue.put(e)
                continue
            _insert_or_report(expt_queue, proc_num, av_url, 'star',
                              database.insert_star, star)
        _insert_or_report(expt_queue, proc_num, av_url, 'm_s',
                          database.insert_m_s, av_num, s[0])

    # Genres: same scheme as stars.
    for g in genres:
        if not database.check_genres(g[0]):
            try:
                genre = crawl_genre(g, proc_num)
            except AvExpt as e:
                print(
                    'Process {} facing exception when crawling genre:'.format(
                        proc_num))
                print(e)
                expt_queue.put(e)
                continue
            _insert_or_report(expt_queue, proc_num, av_url, 'genre',
                              database.insert_genre, genre)
        _insert_or_report(expt_queue, proc_num, av_url, 'm_g',
                          database.insert_m_g, av_num, g[0])

    for li in link_iter:
        _insert_or_report(expt_queue, proc_num, av_url, 'magnet',
                          database.insert_magnet, li)

    for im in images:
        _insert_or_report(expt_queue, proc_num, av_url, 'img',
                          database.insert_img, im, av_num)

    print('Process {} 已扒取完毕:第 {} 页 番号:{}'.format(
        str(proc_num), str(os.path.basename(av_url)), av_num))