def get_movies(username, celebrity, start_number, role='performer'):
    choose_role = {'performer': 'A', 'director': 'D'}
    movie_role = choose_role[role]
    url = ('http://movie.douban.com/celebrity/%s/movies'
           '?start=%s&format=text&sortby=vote&role=%s'
           % (celebrity.ID, start_number, movie_role))
    soup = get_soup(url, timeout=15)
    movie_htmls = soup.findAll(
        'a', href=re.compile(r'http://movie.douban.com/subject/\d{7,8}'))
    star_htmls = soup.findAll('span', class_='rating_nums')
    movie_IDs = [re.search(r'\d{7,8}', movie_html['href']).group()
                 for movie_html in movie_htmls]
    movie_names = [movie_html.text for movie_html in movie_htmls]
    stars = [star_html.text for star_html in star_htmls]
    recommend_movies = Movie_list(
        [Recommend_movie(movie_ID, movie_name, star,
                         score=celebrity.final_score)
         for movie_ID, movie_name, star
         in zip(movie_IDs, movie_names, stars)])
    # Queue the celebrity for the next page of results unless the
    # "no rating" marker (allstar00) shows there is nothing left.
    choose_list = {0: second_page_celebrities,
                   25: third_page_celebrities,
                   50: []}
    exist_html = soup.find('span', class_='allstar00')
    if not exist_html:
        choose_list[start_number].append(celebrity)
    for movie in recommend_movies:
        movie.add_celebrity(celebrity)
    movie_list.extends(recommend_movies, celebrity)  # custom helper, not list.extend
    print('4.celebrity ID %s OK ' % (celebrity.ID))

def get_movies(username, celebrity, start_number, role='performer'):
    choose_role = {'performer': 'A', 'director': 'D'}
    movie_role = choose_role[role]
    url = ('http://movie.douban.com/celebrity/%s/movies'
           '?start=%s&format=text&sortby=vote&role=%s'
           % (celebrity.ID, start_number, movie_role))
    soup = get_soup(url, timeout=8)
    movie_htmls = soup.findAll(
        'a', href=re.compile(r'http://movie.douban.com/subject/\d{7,8}'))
    star_htmls = soup.findAll('span', class_='rating_nums')
    movie_IDs = [re.search(r'\d{7,8}', movie_html['href']).group()
                 for movie_html in movie_htmls]
    movie_names = [movie_html.text for movie_html in movie_htmls]
    stars = [star_html.text for star_html in star_htmls]
    recommend_movies = Movie_list(
        [Recommend_movie(movie_ID, movie_name, star,
                         score=celebrity.final_score)
         for movie_ID, movie_name, star
         in zip(movie_IDs, movie_names, stars)])
    choose_list = {0: second_page_celebrities,
                   25: third_page_celebrities,
                   50: []}
    exist_html = soup.find('span', class_='allstar00')
    if not exist_html:
        choose_list[start_number].append(celebrity)
    for movie in recommend_movies:
        movie.add_celebrity(celebrity)
    movie_list.extends(recommend_movies, celebrity)
    print('4.celebrity ID %s OK ' % (celebrity.ID))

def get_celebrities(username, star, star_movie_ID):
    url = 'http://movie.douban.com/subject/%s/' % (star_movie_ID)
    soup = get_soup(url, timeout=15)
    # Celebrities (top four billed performers)
    celebrity_htmls = soup.findAll('a', {'rel': 'v:starring'},
                                   href=re.compile(r'/celebrity/\d{7}'),
                                   limit=4)
    page_celebrity_IDs = [re.search(r'(\d{7})', celebrity_html['href']).group()
                          for celebrity_html in celebrity_htmls]
    page_celebrity_names = [celebrity.text for celebrity in celebrity_htmls]
    # TODO: the directors are not included!
    directors_htmls = soup.findAll('a', {'rel': 'v:directedBy'},
                                   href=re.compile(r'/celebrity/\d{7}'))
    directors_IDs = [re.search(r'(\d{7})', directors_html['href']).group()
                     for directors_html in directors_htmls]
    directors_names = [director.text for director in directors_htmls]
    page_directors = [Celebrity(directors_ID, original_score=star,
                                name=name, role='director')
                      for directors_ID, name
                      in zip(directors_IDs, directors_names)]
    page_celebrities = [Celebrity(page_celebrity_ID, original_score=star,
                                  name=name)
                        for page_celebrity_ID, name
                        in zip(page_celebrity_IDs, page_celebrity_names)]
    # Movie information
    movie_name = soup.find('span', {'property': 'v:itemreviewed'}).text
    movie = Movie(star_movie_ID, movie_name)
    for page_celebrity in page_celebrities:
        page_celebrity.add_loved_movie(movie)
    for page_director in page_directors:
        page_director.add_loved_movie(movie)
    # Non-standard extends(); a guessed sketch follows this snippet.
    star_directors.extends(page_directors, movie, star)
    star_celebrities.extends(page_celebrities, movie, star)
    print('3.OK %s movie ID' % (star_movie_ID))

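# The snippets above push results into shared list-like objects through a
# non-standard extends() method (note the trailing "s"), called with extra
# positional arguments: movie_list.extends(recommend_movies, celebrity) and
# star_celebrities.extends(page_celebrities, movie, star). Its real
# implementation is not shown anywhere in this section; the class below is
# only a hedged guess at its shape, assuming it deduplicates by ID and lets
# an existing entry absorb a duplicate through a hypothetical merge() hook.
class Movie_list(list):
    def extends(self, items, *context):
        # Assumed behavior: merge items already present (matched by ID),
        # append the rest; the extra context arguments are ignored here.
        known = {item.ID: item for item in self}
        for item in items:
            if item.ID in known:
                known[item.ID].merge(item)  # hypothetical merge hook
            else:
                self.append(item)
                known[item.ID] = item
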
def get_celebrities_pages(username):
    url = 'http://movie.douban.com/people/%s/celebrities' % (username)
    print('Start!')
    soup = get_soup(url, priority='high', timeout=2)
    title = soup.title.text
    # The page title ends with "(N)", the total celebrity count.
    pages = int(re.search(r'\((\d+)\)$', title).group(1))
    return pages

def get_celebrities_pages(username):
    url = 'http://movie.douban.com/people/%s/celebrities' % (username)
    print('Start!')
    soup = get_soup(url, priority='high', timeout=10)
    title = soup.title.text
    pages = int(re.search(r'\((\d+)\)$', title).group(1))
    return pages

def get_movies_pages(username):
    url = 'http://movie.douban.com/people/%s/collect' % (username)
    soup = get_soup(url, timeout=15, priority='high')
    title_text = soup.title.text
    # 30 movies per list page; round the total in "(N)" up to a multiple
    # of 30 (floor division keeps this an int on Python 3).
    movies_pages = (int(re.search(r'\((\d+)\)', title_text).group(1))
                    // 30 + 1) * 30
    return movies_pages

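# Worked example for the arithmetic above: a page title containing "(95)"
# gives (95 // 30 + 1) * 30 = 120, so a caller can visit every 30-item
# list page with range(0, 120, 30), i.e. start=0, 30, 60, 90.
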
def get_special(celebrity):
    url = 'http://movie.douban.com/celebrity/%s/' % (celebrity.ID)
    soup = get_soup(url, timeout=5)
    # The portrait image is titled "点击看大图" ("click for the large
    # version"); rewrite its URL from the medium to the small size.
    image_url = soup.find('img', title=u'点击看大图')['src']
    image_url = re.sub('medium', 'small', image_url)
    celebrity.image_url = image_url
    print('6.celebrity%s image' % (celebrity.ID))

def __analysis_article__(self, base_url):
    soup = get_soup(base_url)
    if not soup:
        return
    # Collect the pager links to work out how many pages the article has.
    children = []
    pager = soup.find(name='li', class_='pager_theme_4')
    if pager:
        children = [child for child in pager.children
                    if str(child) != '\n' and 'href' in str(child)]
    pages = [base_url]
    try:
        if len(children) > 2:
            last_page = int(re.findall(r'.*pn=(\d+)',
                                       children[-1]['href'])[0])
            pages.extend(base_url + '?pn=' + str(i)
                         for i in range(2, last_page + 1))
    except Exception as ex:
        print('get_all_pages...exception:{} base_url:{}'.format(
            str(ex), base_url))
    # Parse every post on every page and group contents by user.
    for page in pages:
        soup = get_soup(page)
        if not soup:
            continue
        for content_div in soup.find_all('div',
                                         attrs={'class': 'l_post_bright'}):
            try:
                user, content = parse_user_and_content(content_div)
                with self.lock:
                    self.all_user_contents.setdefault(user, []).append(content)
            except Exception as ex:
                print(ex)

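# parse_user_and_content() is not defined in this snippet. A minimal
# sketch, assuming the classic Tieba post markup where the author name
# sits in a li.d_name element and the body in a div.d_post_content
# element (both class names are assumptions, not taken from the source):
def parse_user_and_content(content_div):
    user = content_div.find('li', class_='d_name').get_text(strip=True)
    content = content_div.find('div',
                               class_='d_post_content').get_text(strip=True)
    return user, content
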
def __get_articles__(self):
    while len(self.pages) > 0:
        url = self.pages.pop()
        soup = get_soup(url)
        if not soup:
            continue
        # Article links look like "/p/<10-digit id>"; make them absolute.
        with self.lock:
            self.article_urls.extend(
                baidu_base_url + a['href']
                for a in soup.findAll(
                    'a', attrs={'href': re.compile(r'^/p/\d{10}')}))

def get_celebrities(username, start_number):
    url = ('http://movie.douban.com/people/%s/celebrities?start=%s'
           % (username, start_number))
    soup = get_soup(url, timeout=4)
    page_celebrities = soup.findAll(
        'a', href=re.compile(r'http://movie.douban.com/celebrity/\d{7}/$'))
    # Each celebrity link appears twice (avatar and name), so keep every
    # other match.
    page_celebrities = [re.search(r'\d{7}', celebrity['href']).group()
                        for i, celebrity in enumerate(page_celebrities)
                        if i % 2 == 0]
    names_html = soup.findAll('em')
    names = [unicode(name.text) for name in names_html]  # Python 2 builtin
    page_celebrities = [Celebrity(page_celebrity, collect_or_watch='collect',
                                  original_score=5, name=name)
                        for page_celebrity, name
                        in zip(page_celebrities, names)]
    celebrities.extend(page_celebrities)
    print('1.collect page%s OK' % (start_number))

def _get_image(url, movie_ID):
    '''
    Store the poster image on disk and return the path it is served from.

    Parameters:
        url      : poster's URL
        movie_ID : movie ID, used for naming the image file
    '''
    # Warning! It is really a bad design: content=True makes get_soup
    # return raw response bytes instead of a parsed soup.
    content = spider.get_soup(url, timeout=5, priority='high', content=True)
    image_path = os.path.join(dirname, 'media/poster/%s.jpg' % (movie_ID))
    with open(image_path, 'wb') as tt:
        tt.write(content)
    image_url = '/picture/poster/%s.jpg' % (movie_ID)
    return image_url

def get_celebrities(username, star, star_movie_ID):
    url = 'http://movie.douban.com/subject/%s/' % (star_movie_ID)
    soup = get_soup(url, timeout=8)
    if soup:
        # Celebrities (top four billed performers)
        celebrity_htmls = soup.findAll('a', {'rel': 'v:starring'},
                                       href=re.compile(r'/celebrity/\d{7}'),
                                       limit=4)
        page_celebrity_IDs = [
            re.search(r'(\d{7})', celebrity_html['href']).group()
            for celebrity_html in celebrity_htmls
        ]
        page_celebrity_names = [
            celebrity.text for celebrity in celebrity_htmls
        ]
        # Directors
        directors_htmls = soup.findAll('a', {'rel': 'v:directedBy'},
                                       href=re.compile(r'/celebrity/\d{7}'))
        directors_IDs = [
            re.search(r'(\d{7})', directors_html['href']).group()
            for directors_html in directors_htmls
        ]
        directors_names = [director.text for director in directors_htmls]
        page_directors = [
            Celebrity(directors_ID, original_score=star, name=name,
                      role='director')
            for directors_ID, name in zip(directors_IDs, directors_names)
        ]
        page_celebrities = [
            Celebrity(page_celebrity_ID, original_score=star, name=name)
            for page_celebrity_ID, name in zip(page_celebrity_IDs,
                                               page_celebrity_names)
        ]
        # Movie information
        movie_name = soup.find('span', {'property': 'v:itemreviewed'}).text
        movie = Movie(star_movie_ID, movie_name)
        for page_celebrity in page_celebrities:
            page_celebrity.add_loved_movie(movie)
        for page_director in page_directors:
            page_director.add_loved_movie(movie)
        star_directors.extends(page_directors, movie, star)
        star_celebrities.extends(page_celebrities, movie, star)
        print('3.OK %s movie ID' % (star_movie_ID))

def get_movies(username, start_number):
    url = ('http://movie.douban.com/people/%s/collect?start=%s&mode=list'
           % (username, start_number))
    soup = get_soup(url, timeout=10)
    htmls = soup.findAll('li', id=re.compile(r'list\d{7,8}'),
                         class_=re.compile('item'))
    for html in htmls:
        star_html = html.find('span', class_=re.compile(r'rating\d-t'))
        if star_html:
            star_html = star_html['class'][0]
        else:
            star_html = None
        movie_ID = re.search(r'\d{7,8}', html['id']).group()
        # Bucket each movie by the user's star rating.
        if star_html == 'rating5-t':
            five_star_movies_IDs.append(movie_ID)
        elif star_html == 'rating4-t':
            four_star_movies_IDs.append(movie_ID)
        movies_have_seen.append(Movie(movie_ID))
    print('2.start number %s' % (start_number))

def get_final_movies(movie):
    def find_author_comment(star):
        star = soup.find('span', class_=('allstar%s0 rating' % (star)))
        if star:
            # Walk from the rating span to the sibling holding the comment
            # text and back to the one holding the author name.
            comment = star.parent.parent.next_sibling.next_sibling.next
            comment = str(comment)
            author = star.previous_sibling.previous_sibling.text
        else:
            comment = None
            author = None
        return comment, author

    def get_comment():
        # TV series have an episode list; only keep real movies with a
        # reasonable number of voters.
        is_movie = not bool(soup.find('div', class_='episode_list'))
        audience_number = soup.find('span', {'property': 'v:votes'})
        if audience_number:
            audience_number = audience_number.text
        else:
            audience_number = 0
        has_enough_audience = int(audience_number) > 250
        if is_movie and has_enough_audience:
            four_comment, four_author = find_author_comment(4)
            five_comment, five_author = find_author_comment(5)
            if not five_comment:
                movie.comment = Comment(four_comment, four_author, 4)
            else:
                movie.comment = Comment(five_comment, five_author, 5)
            return movie.comment

    url = 'http://movie.douban.com/subject/%s/' % (movie.ID)
    soup = get_soup(url, timeout=8)
    comment = get_comment()
    # Poster url
    poster_url = soup.find('img', {'rel': 'v:image'},
                           {'title': u'点击看更多海报'})['src']
    if comment:
        movie.poster_url = poster_url
        print('5.Ok %s' % (movie.ID))
        final_movies.append(movie)

def get_movies(username, start_number):
    url = ('http://movie.douban.com/people/%s/collect?start=%s&mode=list'
           % (username, start_number))
    soup = get_soup(url, timeout=10)
    if soup:
        htmls = soup.findAll('li', id=re.compile(r'list\d{7,8}'),
                             class_=re.compile('item'))
        for html in htmls:
            star_html = html.find('span', class_=re.compile(r'rating\d-t'))
            if star_html:
                star_html = star_html['class'][0]
            else:
                star_html = None
            movie_ID = re.search(r'\d{7,8}', html['id']).group()
            if star_html == 'rating5-t':
                five_star_movies_IDs.append(movie_ID)
            elif star_html == 'rating4-t':
                four_star_movies_IDs.append(movie_ID)
            movies_have_seen.append(Movie(movie_ID))
        print('2.start number %s' % (start_number))

def get_celebrities(username, start_number):
    url = ('http://movie.douban.com/people/%s/celebrities?start=%s'
           % (username, start_number))
    soup = get_soup(url, timeout=10)
    page_celebrities = soup.findAll(
        'a', href=re.compile(r'http://movie.douban.com/celebrity/\d{7}/$'))
    # Each celebrity link appears twice (avatar and name), so keep every
    # other match.
    page_celebrities = [re.search(r'\d{7}', celebrity['href']).group()
                        for i, celebrity in enumerate(page_celebrities)
                        if i % 2 == 0]
    names_html = soup.findAll('em')
    names = [unicode(name.text) for name in names_html]  # Python 2 builtin
    page_celebrities = [Celebrity(page_celebrity, collect_or_watch='collect',
                                  original_score=5, name=name)
                        for page_celebrity, name
                        in zip(page_celebrities, names)]
    celebrities.extend(page_celebrities)
    print('1.collect page%s OK' % (start_number))

def parse_user(url):
    soup = get_soup(url)
    if not soup:
        return None
    userInfo = soup.find(name='div', attrs={'class': 'userinfo_userdata'})
    userinfo__head = soup.find(name='div', attrs={'id': 'j_userhead'})
    if userInfo:
        sex = userInfo.find(name='span',
                            attrs={'class': 'userinfo_sex_male'})
        sex_ = 'male' if sex else 'female'
        try:
            user_head = userinfo__head.find(
                name='img', attrs={'src': re.compile(r'http://?.*')})['src']
        except Exception as ex:
            print(ex)
            user_head = ''
        # The labels are scraped from the profile markup: 用户名 (user
        # name), 吧龄 (forum age), 发贴 (post count).
        name_ = re.findall(r'用户名:(.+?)<', str(userInfo))[0]
        age_ = re.findall(r'吧龄:(.+?)<', str(userInfo))[0]
        titles_ = re.findall(r'发贴:(.+?)<', str(userInfo))[0]
        u = User(name_, user_head, url, sex_, age_, titles_)
        return str(json.dumps(u.__dict__, ensure_ascii=False))
    return str(json.dumps(User(url=url).__dict__, ensure_ascii=False))

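# parse_user serializes the User object's __dict__, so the return value is
# a JSON string. The exact keys depend on User.__init__, which is not
# shown; assuming attribute names that mirror the constructor arguments,
# the output would look roughly like:
# '{"name": "...", "head": "http://...", "url": "...", "sex": "male", ...}'
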
def get_final_movies(movie):
    url = 'http://movie.douban.com/subject/%s/' % (movie.ID)
    soup = get_soup(url, timeout=15)
    # TV series have an episode list; skip their comments.
    is_movie = not bool(soup.find('div', class_='episode_list'))

    def find_author_comment(star):
        star = soup.find('span', class_=('allstar%s0 rating' % (star)))
        if star:
            comment = star.parent.parent.next_sibling.next_sibling.next
            author = star.previous_sibling.previous_sibling.text
        else:
            comment = None
            author = None
        return comment, author

    if is_movie:
        four_comment, four_author = find_author_comment(4)
        five_comment, five_author = find_author_comment(5)
        movie.comment = [Comment(four_comment, four_author, 4),
                         Comment(five_comment, five_author, 5)]
    # Poster url
    poster_url = soup.find('img', {'rel': 'v:image'},
                           {'title': u'点击看更多海报'})['src']
    movie.poster_url = poster_url
    print('5.Ok %s' % (movie.ID))
    final_movies.append(movie)

def get_special(celebrity):
    url = 'http://movie.douban.com/celebrity/%s/' % (celebrity.ID)
    soup = get_soup(url, timeout=12)
    image_url = soup.find('img', title=u'点击看大图')['src']
    celebrity.image_url = image_url

import spider
import download

# Set the root URL and fetch the vip.com category page.
root_url = 'https://category.vip.com/'
soup = spider.get_soup(root_url)

# Scrape tree_id and c_id from the page.
tree_id = spider.get_tree_id(soup)
c_id = spider.get_c_id(soup)

for c_item in c_id[6:7]:
    # Use tree_id and c_id to build the URL that lists each category's links.
    format_url = ('https://category.vip.com/ajax/getTreeList.php'
                  '?cid={}&tree_id={}').format(c_item, tree_id)
    url_soup = spider.get_soup(format_url)
    name_dic = spider.get_name_dic(url_soup)
    url_dic = spider.get_url_dic(url_soup, name_dic)
    for folder, url in url_dic.items():
        folder = spider.folder_name_simplify(folder)
        product_list = spider.get_product_list(url)
        # Because of vip.com's site restrictions, fetch and store the
        # first fifty items first, then the remainder.
        download.down_img(folder, product_list[:50])
        if len(product_list) >= 50:
            download.down_img(folder, product_list[50:])
        print(folder + ' download finished')

def get_movies_pages(username):
    url = 'http://movie.douban.com/people/%s/collect?&mode=list' % (username)
    soup = get_soup(url, timeout=1, priority='high')
    title_text = soup.title.text
    # 30 movies per list page; round the total in "(N)" up to a multiple
    # of 30 (floor division keeps this an int on Python 3).
    movies_pages = (int(re.search(r'\((\d+)\)', title_text).group(1))
                    // 30 + 1) * 30
    return movies_pages

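# Every snippet in this section depends on a shared get_soup() helper that
# is never shown. The sketch below is a hypothetical reconstruction, not
# the original: it assumes requests plus BeautifulSoup, honors the timeout
# and content keyword arguments the way the call sites use them, and
# merely accepts the priority argument (the original presumably used it
# for request scheduling).
import requests
from bs4 import BeautifulSoup


def get_soup(url, timeout=10, priority='normal', content=False):
    """Fetch url; return a BeautifulSoup, raw bytes, or None on failure."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        return None
    if content:
        # content=True callers (e.g. _get_image) want the raw bytes.
        return response.content
    return BeautifulSoup(response.content, 'html.parser')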