def get_data(self, item):
    """Extract the picture, link, title and price text of one market item.

    Args:
        item: Parsed HTML node exposing ``find`` (a BeautifulSoup tag is
            assumed -- confirm against the caller).

    Returns:
        list: ``[img, url, title, des]`` where ``img`` is the text inside
        the first pair of parentheses of the picture div's ``style``
        attribute, ``url`` is the title link's ``href`` and ``title`` /
        ``des`` are passed through ``StringUtils.filter_space_and_enter``.
    """
    inline_style = item.find('div', 'market-spu-pic').get('style')
    # Presumably the style looks like "background-image: url(...)"; the
    # picture address is whatever sits between the first "(" and ")".
    paren_content = re.compile(r'[(](.*?)[)]', re.S)
    img = paren_content.findall(inline_style)[0]

    title_link = item.find('a', 'market-spu-title')
    url = title_link.get('href')
    title = StringUtils.filter_space_and_enter(title_link.string)

    price_tag = item.find('span', 'market-spu-price')
    des = StringUtils.filter_space_and_enter(price_tag.string)
    return [img, url, title, des]
def get_data(self, item):
    """Extract image, link, title, follow text, owner name and date of an item.

    Args:
        item: Parsed HTML node exposing ``find`` (a BeautifulSoup tag is
            assumed -- confirm against the caller).

    Returns:
        list: ``[img, url, title, des, name, time]``. ``img`` comes from the
        first image's ``data-origin`` attribute, ``url`` / ``title`` from
        the link inside ``div.title``, ``des`` from ``div.follow``, ``name``
        from the ``title`` attribute of the ``address`` element and ``time``
        from ``div.datetime``. ``des`` and ``time`` are passed through
        ``StringUtils.filter_space_and_enter``.
    """
    img = item.find('img').get('data-origin')

    title_link = item.find('div', 'title').find('a')
    url = title_link.get('href')
    title = title_link.get('title')

    follow_tag = item.find('div', 'follow')
    des = StringUtils.filter_space_and_enter(follow_tag.string)

    name = item.find('address').get('title')

    datetime_tag = item.find('div', 'datetime')
    posted_at = StringUtils.filter_space_and_enter(datetime_tag.string)
    return [img, url, title, des, name, posted_at]
def get_data(self, item):
    """Extract cover image, book id, title and author of one book entry.

    Args:
        item: Parsed HTML node exposing ``find`` (a BeautifulSoup tag is
            assumed -- confirm against the caller).

    Returns:
        list: ``[img, book_id, title, author]`` where ``book_id`` is the
        first link's ``href`` with every non-digit character removed and
        ``author`` is the ``div.author`` text passed through
        ``StringUtils.filter_space_and_enter``.
    """
    img = item.find('img').get('src')
    a_tag = item.find('a')
    # Raw string: '\D' in a plain literal is an invalid escape sequence.
    book_id = re.sub(r'\D', '', a_tag.get('href'))
    title = a_tag.get('title')
    author = StringUtils.filter_space_and_enter(
        item.find('div', 'author').string)
    return [img, book_id, title, author]
def get_data(self, item):
    """Extract image, link, title, optional price text and author of an item.

    Args:
        item: Parsed HTML node exposing ``find`` (a BeautifulSoup tag is
            assumed -- confirm against the caller).

    Returns:
        list: ``[img, url, title, des, name]``. ``img`` and ``title`` come
        from the first image's ``data-origin`` and ``alt`` attributes,
        ``url`` from the first link's ``href`` and ``name`` from
        ``div.author``. ``des`` is the cleaned ``div.price`` text, or ``''``
        when the item has no price element.
    """
    cover = item.find('img')
    img = cover.get('data-origin')
    url = item.find('a').get('href')
    title = cover.get('alt')

    price_tag = item.find('div', 'price')
    if price_tag is None:
        # Items without a price element get an empty description.
        des = ''
    else:
        des = StringUtils.filter_space_and_enter(price_tag.string)

    name = item.find('div', 'author').string
    return [img, url, title, des, name]
def get_data(self, item):
    """Extract book details plus the attached review snippet of one entry.

    Args:
        item: Parsed HTML node exposing ``find`` (a BeautifulSoup tag is
            assumed -- confirm against the caller).

    Returns:
        list: ``[img, book_id, title, author, score, information_title,
        source, information_des]``. ``book_id`` is the first link's ``href``
        with every non-digit removed. ``information_title`` is the text of
        the link inside ``p.reviews``; ``information_des`` is the first
        ``|``-separated text chunk of that same element, cleaned and with
        every ``(`` removed.
    """
    img_tag = item.find('img')
    img = img_tag.get('src')
    # Raw strings below: '\D' and '\(' are invalid escape sequences in
    # ordinary string literals.
    book_id = re.sub(r'\D', '', item.find('a').get('href'))
    title = img_tag.get('alt')
    author = StringUtils.filter_space_and_enter(
        item.find('p', 'author').string)
    score = StringUtils.filter_space_and_enter(
        item.find('span', 'average-rating').string)
    source = StringUtils.filter_space_and_enter(
        item.find('p', 'book-list-classification').string)

    p_review_tag = item.find('p', 'reviews')
    information_title = p_review_tag.find('a').string
    # Join all text nodes with "|" so the leading chunk can be split off.
    texts = p_review_tag.get_text("|", strip=True)
    information_des = StringUtils.filter_space_and_enter(
        str(texts).split("|")[0])
    return [
        img, book_id, title, author, score, information_title, source,
        re.sub(r'\(', '', information_des)
    ]
def get_data(self, item):
    """Build a record for one entry by following its link and parsing the page.

    The entry's own title, meta and abstract texts are read first; the
    linked page is then downloaded and handed to
    ``InformationBookParser.parse_note`` when the URL contains ``'note'``,
    otherwise to ``InformationBookParser.parse_review``.

    Args:
        item: Parsed HTML node exposing ``find`` (a BeautifulSoup tag is
            assumed -- confirm against the caller).

    Returns:
        The parser's result with ``information_title``, ``source`` and
        ``information_des`` appended in that order, or ``None`` when the
        parser returns ``None``. Presumably the parser yields
        ``[img, book_id, title, author, score]`` -- verify against
        ``InformationBookParser``.
    """
    url = item.find('a').get('href')  # type: str
    information_title = item.find('span', 'title').string
    source = item.find('span', 'meta').string
    abstract_tag = item.find('p', 'abstract')
    information_des = StringUtils.filter_space_and_enter(abstract_tag.string)

    page = BeautifulSoup(requests.get(url).text, 'html.parser')
    if 'note' in url:
        record = InformationBookParser.parse_note(page)
    else:
        record = InformationBookParser.parse_review(page)
    if record is None:
        return None

    for extra in (information_title, source, information_des):
        record.append(extra)
    return record
def get_data(self, item):
    """Extract book details and the optional quote of one table-style entry.

    Args:
        item: Parsed HTML node exposing ``find`` (a BeautifulSoup tag is
            assumed -- confirm against the caller).

    Returns:
        list: ``[img, book_id, title, author, score, information_title,
        source, information_des]``. ``book_id`` is the ``href`` of the link
        inside ``div.pl2`` with every non-digit removed.
        ``information_title`` is the text of the first ``span`` inside
        ``div.pl2`` and ``information_des`` the text of ``span.inq``; each
        falls back to ``''`` when its element is missing.
    """
    img = item.find('img').get('src')
    div_tag = item.find('div', 'pl2')
    a_tag = div_tag.find('a')
    # Raw string: '\D' in a plain literal is an invalid escape sequence.
    book_id = re.sub(r'\D', '', a_tag.get('href'))
    title = a_tag.get('title')
    author = item.find('p', 'pl').string
    score = item.find('span', 'rating_nums').string
    source = StringUtils.filter_space_and_enter(
        item.find('span', 'pl').string)

    span_tag = div_tag.find('span')
    information_title = '' if span_tag is None else span_tag.string
    des_tag = item.find('span', 'inq')
    information_des = '' if des_tag is None else des_tag.string
    return [
        img, book_id, title, author, score, information_title, source,
        information_des
    ]
def _crawl_book_des(self, url, start, book_sub_label: BookSubLabel):
    """Crawl one listing page and append a ``BookDes`` for every book on it.

    Downloads ``url + start``, collects each ``li.subject-item`` entry and,
    for every entry, downloads ``https://book.douban.com/subject/<book_id>``
    and scrapes, in this order: img, book_id, title, author, score, press,
    producers, sub_title, origin_name, translator, publish_time, page,
    price, comment_num, star, content_des, author_des, tags.

    Args:
        url: Listing URL prefix; ``start`` is concatenated onto it as-is.
        start: Paging suffix appended to ``url`` (must be a ``str``).
        book_sub_label: Object whose ``book_des_array`` list receives the
            scraped ``BookDes`` records.

    Returns:
        bool: ``False`` when the listing page has no book entries (or the
        listing container is missing), ``True`` once every entry on the
        page has been processed.
    """

    def intro_text(indent_div):
        """Concatenate the <p> texts of an 'intro' block inside indent_div."""
        if indent_div is None:
            return ''
        # Long descriptions keep the full text in a hidden span; prefer it
        # over the truncated visible copy.
        expanded = indent_div.find('span', 'all hidden')
        holder = indent_div if expanded is None else expanded
        intro = holder.find('div', 'intro')
        if intro is None:
            return ''
        paragraphs = intro.find_all('p')
        last = len(paragraphs) - 1
        pieces = []
        for idx, paragraph in enumerate(paragraphs):
            text = paragraph.string
            if text is not None:
                # Every paragraph except the one at the last index is
                # followed by a literal backslash-n marker (two characters).
                pieces.append(text if idx == last else text + '\\n')
        return ''.join(pieces)

    url = url + start
    listing = BeautifulSoup(requests.get(url=url).text, 'html.parser')

    # find() returns None for a missing element while find_all() returns an
    # empty result (never None), so each step is checked and an empty page
    # is reported as "nothing to crawl".
    container = listing.find('div', id='subject_list')
    book_list = (None if container is None
                 else container.find('ul', 'subject-list'))
    items = ([] if book_list is None
             else book_list.find_all('li', 'subject-item'))
    if not items:
        return False

    for li in items:
        book_id = re.sub(r'\D', '', li.find('a', 'nbg').get('href'))
        detail_url = 'https://book.douban.com/subject/' + book_id
        detail = BeautifulSoup(requests.get(url=detail_url).text,
                               'html.parser')
        div_article = detail.find('div', id='content').find('div', 'article')

        img_tag = div_article.find('div', id='mainpic').find('img')
        img = img_tag.get('src')
        title = img_tag.get('alt')

        div_info = div_article.find('div', id='info')
        author = div_info.find_all('a')[0].string
        # The remaining metadata is pulled out of the raw HTML of the info
        # block by label.
        origin_str = str(div_info)
        press = _get_middle_str(origin_str, '出版社:')
        producers = _get_pattern_middle_str(
            r'<span class="pl">出品方:</span>(.*?)</a>',
            origin_str)  # type: str
        if not StringUtils.is_empty(producers):
            producers = re.sub('<.*?>', '', producers)
        sub_title = _get_middle_str(origin_str, '副标题:')
        origin_name = _get_middle_str(origin_str, '原作名:')
        translator = _get_pattern_middle_str(
            r'<span class="pl"> 译者</span>(.*?)</a>', origin_str)
        if not StringUtils.is_empty(translator):
            # Drop separator characters first, then any remaining markup.
            translator = re.sub('<.*?>', '',
                                re.sub('[:\n\xa0 ·]', '', translator))
        publish_time = _get_middle_str(origin_str, '出版年:')
        page = _get_middle_str(origin_str, '页数:')
        price = _get_middle_str(origin_str, '定价:')

        div_interest = div_article.find('div', id='interest_sectl')
        score = div_interest.find('strong', 'll rating_num ').string
        comment_num = div_interest.find(
            'a', 'rating_people').find('span').string
        star = ''.join(
            span.string
            for span in div_interest.find_all('span', 'rating_per'))

        div_related_info = div_article.find('div', 'related_info')
        content_des = intro_text(
            div_related_info.find('div', 'indent', id='link-report'))

        # The author introduction is taken from the second "indent" block;
        # pages with fewer blocks simply have no author description.
        indent_divs = div_related_info.find_all('div', 'indent')
        author_des = intro_text(indent_divs[1]) if len(indent_divs) > 1 else ''

        tags_section = div_related_info.find('div', id='db-tags-section')
        tags_indent = (None if tags_section is None
                       else tags_section.find('div', 'indent'))
        if tags_indent is None:
            tags = ''
        else:
            tags = ','.join(
                span.find('a').string
                for span in tags_indent.find_all('span'))

        # Stringify once so the debug line and the record agree, and so a
        # missing (None) field cannot break the output.
        fields = [
            str(value) for value in (
                img, book_id, title, author, score, press, producers,
                sub_title, origin_name, translator, publish_time, page,
                price, comment_num, star, content_des, author_des, tags)
        ]
        print('->'.join(fields))
        book_sub_label.book_des_array.append(BookDes(*fields))
    return True