Python StringUtils示例，douban.utils.StringUtils Python示例

示例#1

0

显示文件

文件： home_parse.py 项目： hehuiran/Crawl

 def get_data(self, item):
     """Collect the picture URL, link, title and price text of one item.

     ``item`` must expose ``find`` (it is queried for a
     ``div.market-spu-pic``, an ``a.market-spu-title`` and a
     ``span.market-spu-price``).

     Returns:
         ``[img, url, title, des]`` where ``title`` and ``des`` have been
         passed through ``StringUtils.filter_space_and_enter``.
     """
     # The picture URL is the text between the first pair of parentheses
     # in the inline style attribute (presumably a CSS url(...) value --
     # confirm against the page markup).
     picture_style = item.find('div', 'market-spu-pic').get('style')
     paren_pattern = re.compile(r'[(](.*?)[)]', re.S)
     img = paren_pattern.findall(picture_style)[0]

     title_link = item.find('a', 'market-spu-title')
     url = title_link.get('href')
     title = StringUtils.filter_space_and_enter(title_link.string)

     price_span = item.find('span', 'market-spu-price')
     des = StringUtils.filter_space_and_enter(price_span.string)
     return [img, url, title, des]

示例#2

0

显示文件

文件： home_parse.py 项目： hehuiran/Crawl

 def get_data(self, item):
     """Collect image, link, title, follow text, name and date of one item.

     ``item`` must expose ``find``; it is queried for an ``img``, a
     ``div.title`` containing an ``a``, a ``div.follow``, an ``address``
     and a ``div.datetime``.

     Returns:
         ``[img, url, title, des, name, time]`` -- the follow text and
         the datetime text are passed through
         ``StringUtils.filter_space_and_enter``.
     """
     image = item.find('img')
     img = image.get('data-origin')

     title_link = item.find('div', 'title').find('a')
     url = title_link.get('href')
     title = title_link.get('title')

     follow_div = item.find('div', 'follow')
     des = StringUtils.filter_space_and_enter(follow_div.string)

     name = item.find('address').get('title')

     datetime_div = item.find('div', 'datetime')
     published = StringUtils.filter_space_and_enter(datetime_div.string)
     return [img, url, title, des, name, published]

示例#3

0

显示文件

 def get_data(self, item):
     """Collect cover image, numeric book id, title and author of one item.

     ``item`` must expose ``find``; it is queried for an ``img``, the
     first ``a`` and a ``div.author``.

     Returns:
         ``[img, book_id, title, author]`` where ``book_id`` is the link's
         ``href`` with every non-digit character removed and ``author``
         has been passed through ``StringUtils.filter_space_and_enter``.
     """
     img = item.find('img').get('src')
     a_tag = item.find('a')
     # Raw string: '\D' in a plain literal is an invalid escape sequence
     # (a warning today, scheduled to become an error).
     book_id = re.sub(r'\D', '', a_tag.get('href'))
     title = a_tag.get('title')
     author = StringUtils.filter_space_and_enter(
         item.find('div', 'author').string)
     return [img, book_id, title, author]

示例#4

0

显示文件

文件： home_parse.py 项目： hehuiran/Crawl

 def get_data(self, item):
     """Collect image, link, title, optional price text and author name.

     ``item`` must expose ``find``; it is queried for an ``img``, the
     first ``a``, an optional ``div.price`` and a ``div.author``.

     Returns:
         ``[img, url, title, des, name]``. ``des`` is the empty string
         when the item has no ``div.price``; otherwise it is the price
         text passed through ``StringUtils.filter_space_and_enter``.
     """
     image = item.find('img')
     img = image.get('data-origin')
     url = item.find('a').get('href')
     title = image.get('alt')

     # The price block is optional, so fall back to an empty description.
     price_div = item.find('div', 'price')
     if price_div is None:
         des = ''
     else:
         des = StringUtils.filter_space_and_enter(price_div.string)

     author_div = item.find('div', 'author')
     name = author_div.string
     return [img, url, title, des, name]

示例#5

0

显示文件

 def get_data(self, item):
     """Collect book and review information from one list item.

     ``item`` must expose ``find``; it is queried for an ``img``, the
     first ``a``, ``p.author``, ``span.average-rating``,
     ``p.book-list-classification`` and ``p.reviews``.

     Returns:
         ``[img, book_id, title, author, score, information_title,
         source, information_des]``. ``book_id`` is the link's ``href``
         with non-digits removed; ``information_des`` is the first text
         fragment of ``p.reviews`` with every ``(`` removed.
     """
     img_tag = item.find('img')
     img = img_tag.get('src')
     # Raw strings below: '\D' and '\(' are invalid escape sequences in
     # plain string literals.
     book_id = re.sub(r'\D', '', item.find('a').get('href'))
     title = img_tag.get('alt')
     author = StringUtils.filter_space_and_enter(
         item.find('p', 'author').string)
     score = StringUtils.filter_space_and_enter(
         item.find('span', 'average-rating').string)
     source = StringUtils.filter_space_and_enter(
         item.find('p', 'book-list-classification').string)

     p_review_tag = item.find('p', 'reviews')
     information_title = p_review_tag.find('a').string
     # Join the stripped text fragments with '|' so the first fragment
     # can be isolated again by splitting on the same separator.
     texts = p_review_tag.get_text("|", strip=True)
     information_des = StringUtils.filter_space_and_enter(
         texts.split("|")[0])
     return [
         img, book_id, title, author, score, information_title, source,
         re.sub(r'\(', '', information_des)
     ]

示例#6

0

显示文件

 def get_data(self, item):
     """Build a book record for one item by fetching its linked page.

     Reads the link, title, source and abstract from ``item``, downloads
     the linked page and hands the parsed page to
     ``InformationBookParser.parse_note`` when the URL contains ``note``,
     otherwise to ``InformationBookParser.parse_review``.

     Returns:
         ``None`` when the parser returns ``None``; otherwise the parser's
         result with ``information_title``, ``source`` and
         ``information_des`` appended, in that order.
     """
     url = item.find('a').get('href')  # type: str
     information_title = item.find('span', 'title').string
     source = item.find('span', 'meta').string
     abstract = item.find('p', 'abstract')
     information_des = StringUtils.filter_space_and_enter(abstract.string)

     page = BeautifulSoup(requests.get(url).text, 'html.parser')

     # Final layout: [img, book_id, title, author, score,
     # information_title, source, information_des]
     if 'note' in url:
         array = InformationBookParser.parse_note(page)
     else:
         array = InformationBookParser.parse_review(page)
     if array is None:
         return None

     for extra in (information_title, source, information_des):
         array.append(extra)
     return array

示例#7

0

显示文件

 def get_data(self, item):
     """Collect book, rating and quote information from one list item.

     ``item`` must expose ``find``; it is queried for an ``img``, a
     ``div.pl2`` (holding the title link and an optional ``span``),
     ``p.pl``, ``span.rating_nums``, ``span.pl`` and an optional
     ``span.inq``.

     Returns:
         ``[img, book_id, title, author, score, information_title,
         source, information_des]``. ``book_id`` is the title link's
         ``href`` with non-digits removed; ``information_title`` and
         ``information_des`` are ``''`` when their tags are absent.
     """
     img = item.find('img').get('src')
     div_tag = item.find('div', 'pl2')
     a_tag = div_tag.find('a')
     # Raw string: '\D' in a plain literal is an invalid escape sequence.
     book_id = re.sub(r'\D', '', a_tag.get('href'))
     title = a_tag.get('title')
     author = item.find('p', 'pl').string
     score = item.find('span', 'rating_nums').string
     source = StringUtils.filter_space_and_enter(
         item.find('span', 'pl').string)
     # Both of the following tags are optional on the page.
     span_tag = div_tag.find('span')
     information_title = '' if span_tag is None else span_tag.string
     des_tag = item.find('span', 'inq')
     information_des = '' if des_tag is None else des_tag.string
     return [
         img, book_id, title, author, score, information_title, source,
         information_des
     ]

示例#8

0

显示文件

文件： lable_book.py 项目： hehuiran/Crawl

    def _crawl_book_des(self, url, start, book_sub_label: BookSubLabel):
        """Crawl one listing page and record a ``BookDes`` for every book.

        Downloads ``url + start``, collects the ``li.subject-item``
        entries under ``div#subject_list > ul.subject-list`` and, for each
        entry, downloads ``https://book.douban.com/subject/<book_id>`` and
        scrapes the detail fields from it. Every scraped book is appended
        to ``book_sub_label.book_des_array`` and echoed with ``print``.

        Args:
            url: Listing URL prefix; ``start`` is concatenated onto it.
            start: Suffix appended to ``url`` (must support ``url + start``).
            book_sub_label: Receiver whose ``book_des_array`` is extended.

        Returns:
            ``False`` when the listing container is missing or holds no
            items, ``True`` once every listed book has been processed.
        """

        def _intro_text(container):
            # Concatenate the <p> texts of the container's "intro" div,
            # preferring the expanded "all hidden" variant when present.
            # Every paragraph but the last is followed by a literal
            # backslash + 'n' pair, exactly as before.
            # NOTE(review): confirm a literal '\\n' rather than a real
            # newline is what downstream storage expects.
            if container is None:
                return ''
            expanded = container.find('span', 'all hidden')
            holder = container if expanded is None else expanded
            intro = holder.find('div', 'intro')
            if intro is None:
                return ''
            paragraphs = intro.find_all('p')
            last = len(paragraphs) - 1
            return ''.join(
                p.string if i == last else p.string + '\\n'
                for i, p in enumerate(paragraphs)
                if p.string is not None)

        listing_html = requests.get(url=url + start).text
        listing = BeautifulSoup(listing_html, 'html.parser')

        # find() returns None for a missing tag and find_all() returns an
        # empty list (never None), so each step is checked explicitly.
        subject_div = listing.find('div', id='subject_list')
        subject_ul = (None if subject_div is None else subject_div.find(
            'ul', 'subject-list'))
        lis = ([] if subject_ul is None else subject_ul.find_all(
            'li', 'subject-item'))
        if not lis:
            return False

        # Field order: img, book_id, title, author, score, press,
        # producers, sub_title, origin_name, translator, publish_time,
        # page, price, comment_num, star, content_des, author_des, tags
        for li in lis:
            # Raw string: '\D' in a plain literal is an invalid escape.
            book_id = re.sub(r'\D', '', li.find('a', 'nbg').get('href'))
            detail_url = 'https://book.douban.com/subject/' + book_id
            detail_html = requests.get(url=detail_url).text
            detail = BeautifulSoup(detail_html, 'html.parser')

            div_article = detail.find('div', id='content').find(
                'div', 'article')
            img_tag = div_article.find('div', id='mainpic').find('img')
            img = img_tag.get('src')
            title = img_tag.get('alt')

            div_info = div_article.find('div', id='info')
            author = div_info.find_all('a')[0].string

            # The labelled fields are pulled out of the raw markup of the
            # info block by the module's string helpers.
            origin_str = str(div_info)
            press = _get_middle_str(origin_str, '出版社:')

            producers = _get_pattern_middle_str(
                r'<span class="pl">出品方:</span>(.*?)</a>',
                origin_str)  # type: str
            if not StringUtils.is_empty(producers):
                producers = re.sub('<.*?>', '', producers)

            sub_title = _get_middle_str(origin_str, '副标题:')
            origin_name = _get_middle_str(origin_str, '原作名:')
            translator = _get_pattern_middle_str(
                r'<span class="pl"> 译者</span>(.*?)</a>', origin_str)
            if not StringUtils.is_empty(translator):
                translator = re.sub('<.*?>', '',
                                    re.sub('[:\n\xa0 ·]', '', translator))

            publish_time = _get_middle_str(origin_str, '出版年:')
            page = _get_middle_str(origin_str, '页数:')
            price = _get_middle_str(origin_str, '定价:')

            div_interest = div_article.find('div', id='interest_sectl')
            score = div_interest.find('strong', 'll rating_num ').string
            comment_num = div_interest.find(
                'a', 'rating_people').find('span').string
            star = ''.join(
                span.string
                for span in div_interest.find_all('span', 'rating_per'))

            div_related_info = div_article.find('div', 'related_info')

            content_des = _intro_text(
                div_related_info.find('div', 'indent', id='link-report'))

            # The author introduction is the second "indent" block; it is
            # absent on some pages, so guard the index instead of letting
            # it raise IndexError.
            indent_divs = div_related_info.find_all('div', 'indent')
            author_des = _intro_text(
                indent_divs[1] if len(indent_divs) > 1 else None)

            div_tag = div_related_info.find('div', id='db-tags-section')
            spans_tag = div_tag.find('div', 'indent').find_all('span')
            tags = ','.join(span.find('a').string for span in spans_tag)

            fields = (img, book_id, title, author, score, press, producers,
                      sub_title, origin_name, translator, publish_time, page,
                      price, comment_num, star, content_des, author_des, tags)
            # str() each field so a missing (None) value cannot break the
            # debug line; every field is separated by '->'.
            print('->'.join(str(field) for field in fields))

            book_des = BookDes(*(str(field) for field in fields))
            book_sub_label.book_des_array.append(book_des)

        return True