def parse_page(url, thread_num, counter):
    """Crawl one listing page and store every movie on it that is not yet known.

    Args:
        url: Address of the listing page; its basename is used as the page
            label in progress messages.
        thread_num: Identifier of the calling worker, used only for logging.
        counter: Object exposing ``increment_parse()``, called as movies are
            stored.

    Returns:
        The value produced by ``parser.get_next_page_url`` for this page.
    """
    listing_soup = parser.get_main_page_soup(url)
    movie_links = parser.get_movie_page_list(listing_soup)
    next_page = parser.get_next_page_url(listing_soup)

    for movie_link in movie_links:
        av_num = parser.get_av_num(movie_link)

        # A movie already in the database is skipped; the remaining links on
        # the page are still processed.
        if database.check_existence(av_num):
            print('* 已存在 %s 停止爬取 *' % av_num)
            continue

        detail_soup = parser.get_link_soup(movie_link)

        # Progress line: worker id, page label and movie number.
        print('Thread {} 正在扒取:第 {} 页 番号:{}'.format(
            str(thread_num), str(os.path.basename(url)), av_num))

        # Collect everything from the detail page before touching the database.
        movie = parser.get_movie(detail_soup, av_num)
        stars = parser.get_star_list(detail_soup)
        magnets = parser.get_download_link(detail_soup, url, av_num)
        sample_images = parser.get_sample_img_list(detail_soup)

        # Persist the movie first, then its stars, magnet links and images.
        database.insert_movie(movie)
        for star in stars:
            database.insert_star(star, av_num)
        for magnet in magnets:
            database.insert_magnet(magnet)
        for image_url in sample_images:
            database.insert_img(image_url, av_num)

        # NOTE(review): counted once per stored movie — the original layout
        # was flattened, so confirm this is not meant to be once per page.
        counter.increment_parse()

    print('第 {} 页扒取完毕'.format(str(os.path.basename(url))))
    print('-------------------------')
    return next_page
def _report_crawl_failure(expt_queue, proc_num, stage, expt):
    """Print a failure notice for ``stage`` and forward ``expt`` to the queue."""
    print('Process {} facing exception when {}:'.format(proc_num, stage))
    print(expt)
    expt_queue.put(expt)


def _try_db_insert(expt_queue, proc_num, page_ref, what, insert, *args):
    """Call ``insert(*args)``; on any error, print it and queue an ``AvExpt``.

    ``what`` names the record kind (e.g. ``'movie'``, ``'m_s'``); it appears in
    the printed message and, suffixed with ``'_insert'``, in the queued
    ``AvExpt``. Errors are reported rather than raised so that one failed
    insert does not abort the rest of the crawl.
    """
    try:
        insert(*args)
    except Exception as e:
        print('Process {} facing exception when insert {}:'.format(
            proc_num, what))
        print(e)
        expt_queue.put(
            AvExpt(proc_num, 'database_insert', page_ref, str(e),
                   '{}_insert'.format(what)))


def crawl_movie(av_info, expt_queue, proc_num):
    """Crawl one movie and store its movie, star, genre, link and image data.

    Does nothing if ``database.check_existence`` reports the movie number as
    already present. Crawl failures raised as ``AvExpt`` while building the
    movie object or fetching download links are queued and end the crawl.
    Failures while fetching an individual star or genre are queued and skip
    only that item. Database insert errors are wrapped in ``AvExpt``, queued,
    and processing continues.

    Args:
        av_info: Indexable record. ``av_info[0]`` is passed to ``get_av_num``
            and ``get_download_iter`` (presumably the movie page URL — TODO
            confirm); ``av_info[1]`` is forwarded to ``get_movie_class``.
        expt_queue: Queue-like object with ``put``; receives every reported
            ``AvExpt``.
        proc_num: Identifier of the calling process, used in messages and in
            queued ``AvExpt`` objects.

    Returns:
        None.
    """
    page_ref = av_info[0]
    av_num = get_av_num(page_ref)

    # Truthiness test (rather than ``is True``) so that any truthy result
    # from the existence check skips the movie.
    if database.check_existence(av_num):
        return
    print('Process {} get avnum: {}'.format(proc_num, av_num))

    soup = get_movie_soup(av_info, proc_num)

    try:
        movie = get_movie_class(soup, av_num, av_info[1])
    except AvExpt as e:
        # NOTE(review): message wording ('database inserting') is kept
        # verbatim from the original even though this step builds the movie.
        _report_crawl_failure(expt_queue, proc_num, 'database inserting', e)
        return
    print('movie class:', movie)

    star_id_iter = get_star_iter(soup)
    genres = get_genre_iter(soup)

    try:
        link_iter = get_download_iter(soup, page_ref, av_num, proc_num)
    except AvExpt as e:
        _report_crawl_failure(expt_queue, proc_num, 'crawling link', e)
        return

    images = get_sample_img_iter(soup)

    # A failed movie insert is reported but does not stop the remaining
    # inserts (same best-effort behaviour as before).
    _try_db_insert(expt_queue, proc_num, page_ref, 'movie',
                   database.insert_movie, movie)

    # Stars: crawl and store any star not yet known, then link it to the movie.
    for s in star_id_iter:
        if not database.check_stars(s[0]):
            try:
                star = get_star(s, proc_num)
            except AvExpt as e:
                _report_crawl_failure(expt_queue, proc_num, 'crawling star', e)
                continue
            _try_db_insert(expt_queue, proc_num, page_ref, 'star',
                           database.insert_star, star)
        _try_db_insert(expt_queue, proc_num, page_ref, 'm_s',
                       database.insert_m_s, av_num, s[0])

    # Genres: same pattern as stars.
    for g in genres:
        if not database.check_genres(g[0]):
            try:
                genre = crawl_genre(g, proc_num)
            except AvExpt as e:
                _report_crawl_failure(expt_queue, proc_num, 'crawling genre', e)
                continue
            _try_db_insert(expt_queue, proc_num, page_ref, 'genre',
                           database.insert_genre, genre)
        _try_db_insert(expt_queue, proc_num, page_ref, 'm_g',
                       database.insert_m_g, av_num, g[0])

    # Download (magnet) links.
    for li in link_iter:
        _try_db_insert(expt_queue, proc_num, page_ref, 'magnet',
                       database.insert_magnet, li)

    # Sample image URLs.
    for im in images:
        _try_db_insert(expt_queue, proc_num, page_ref, 'img',
                       database.insert_img, im, av_num)

    print('Process {} 已扒取完毕:第 {} 页 番号:{}'.format(
        str(proc_num), str(os.path.basename(page_ref)), av_num))