示例#1
0
    def handle_info_html(self, html, type_tag):
        """Parse a book detail page and persist the extracted record.

        The page is parsed with BeautifulSoup; the title, the labelled
        metadata inside the ``#info`` element, the cover link, the rating,
        and the two introduction sections are copied onto a new ``Book``,
        which is then passed to ``self.safe_commit``.

        Args:
            html: Markup of the detail page.
            type_tag: Value stored unchanged on ``book.type``.

        Returns:
            ``False`` when the book was parsed and handed to
            ``self.safe_commit``; ``True`` when any step raised, in which
            case a warning is logged and nothing is committed.
        """
        soup = BeautifulSoup(html, 'lxml')
        book = Book()

        try:
            title = soup.h1.span.get_text()

            # Both the metadata block and the cover image live under the
            # same container, so resolve it once.
            subject = soup.find(class_='article').find(class_='indent').find(
                class_='subjectwrap clearfix').find(class_='subject clearfix')

            # Drop every whitespace character (plain and non-breaking
            # spaces, newlines, ...) so labels and values form one
            # contiguous string that can be split on the labels alone.
            info_text = ''.join(subject.find(id='info').get_text().split())
            field_values = self._split_info_fields(info_text)

            pic_href = subject.find(id='mainpic').a['href']

            score = soup.find(
                class_='rating_self clearfix').strong.get_text().strip()
            score_people = soup.find(class_='rating_people').get_text()

            # The first two 'indent' sections are stored as the content
            # introduction and the author introduction respectively.
            intro_sections = soup.find(class_='related_info').find_all(
                class_='indent')[:2]
            content_info = str(intro_sections[0].find(class_='intro')).replace(
                '<div class="intro">', '')
            author_info = str(intro_sections[1].find(class_='intro')).replace(
                '<div class="intro">', '')

            # Populate the record only after every lookup has succeeded.
            book.title = title
            for attr_name, value in field_values.items():
                setattr(book, attr_name, value)
            book.score = score
            book.score_people = score_people
            book.type = type_tag
            book.content_info = content_info
            book.author_info = author_info
            book.pic_href = pic_href

            self.safe_commit(book)
        except Exception as e:
            # Best-effort scraping: one malformed page must not abort the
            # crawl, so report the failure to the caller via the return value.
            self.logger.warning('爬起失败: %s', e)
            return True
        return False

    @staticmethod
    def _split_info_fields(text):
        """Split whitespace-free ``#info`` text into a dict of book fields.

        ``text`` is expected to look like ``作者:A出版社:B...ISBN:C``. Labels
        are searched in a fixed order; the text between one label and the
        next one found is the value of the former. Text before the first
        label found is taken as the author. Fields whose label is absent
        map to ``''``, so the result always contains all twelve keys.
        """
        labelled_fields = (
            ('出版社:', 'publisher'),
            ('出品方:', 'producer'),
            ('副标题:', 'subtitle'),
            ('原作名:', 'original_title'),
            ('译者:', 'translator'),
            ('出版年:', 'year_of_publisher'),
            ('页数:', 'pages'),
            ('定价:', 'price'),
            ('装帧:', 'binding'),
            ('丛书:', 'series'),
            ('ISBN:', 'isbn'),
        )

        values = {'author': ''}
        for _, attr_name in labelled_fields:
            values[attr_name] = ''

        # The author label is simply removed; whatever precedes the first
        # remaining label is then the author value.
        remaining = text.replace('作者:', '')
        current_attr = 'author'
        for label, attr_name in labelled_fields:
            if label not in remaining:
                continue
            # Text up to this label closes the field that was open so far.
            value, _, remaining = remaining.partition(label)
            values[current_attr] = value
            current_attr = attr_name

        # The tail belongs to the last label found, whichever it was, so a
        # page without an ISBN still yields a complete record.
        values[current_attr] = remaining
        return values