Example #1
    def get_book_details(self, url, book_id, book_capter_numb):
        '''Fetch one chapter page, store it in MongoDB and return the next chapter's URL.'''
        html = self.get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        # chapter title
        book_capter_name = soup.select(
            '#wrapper .content_read .box_con .bookname h1')[0].text
        # chapter body text
        book_content = soup.select('#wrapper .content_read #content')[0].text
        # the third link in the bottom navigation bar is "next chapter"
        next_url = soup.select(
            '#wrapper .content_read .box_con .bookname .bottem1 a'
        )[2].attrs['href']

        data_for_book_details = {
            BOOK_ID: book_id,
            DETAILS_NEXT_URL: next_url,
            DETAILS_CAPTER_NUMB: book_capter_numb,
            DETAILS_CAPTER_NAME: book_capter_name,
            DETAILS_CONTENT: book_content
        }
        # write the chapter record into MongoDB
        spidermongo = SpiderMongo()
        spidermongo.insert_data_dabases(BOOK_DETAILS_COLLECTION,
                                        data_for_book_details)
        return next_url
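
None of these examples show the SpiderMongo class itself. Below is a minimal sketch of what its constructor and insert_data_dabases (the misspelling follows the callers) might look like on top of pymongo; the host, port and database name are assumptions:

from pymongo import MongoClient

class SpiderMongo:
    def __init__(self, host='localhost', port=27017, db_name='book_spider'):
        # connection details are assumptions; the real class is not shown
        self.db = MongoClient(host, port)[db_name]

    def insert_data_dabases(self, collection_name, data):
        # insert a single document into the named collection
        self.db[collection_name].insert_one(data)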
Example #2
    def get_newest_capter_numb_from_mongodb(self, book_id):
        '''Return the number of the newest stored chapter for book_id, or None if none exists.'''
        s = SpiderMongo()
        res = s.get_newest_capter_numb_from_mongodb(book_id)
        if len(res) == 0:
            return None
        return res[0]['book_capter_numb']
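
The wrapper above expects a list whose first element carries 'book_capter_numb'. Continuing the SpiderMongo sketch, the underlying query could sort the chapter collection descending and keep a single document; DETAILS_CAPTER_NUMB is presumably the string 'book_capter_numb', given how the caller indexes the result:

    def get_newest_capter_numb_from_mongodb(self, book_id):
        # newest chapter first; an empty list means nothing is stored yet
        cursor = self.db[BOOK_DETAILS_COLLECTION].find({BOOK_ID: book_id}) \
                     .sort(DETAILS_CAPTER_NUMB, -1).limit(1)  # -1 = descending
        return list(cursor)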
Example #3
    def update_book_infos_by_book_id(self, url, book_class, book_id):
        '''
        Returns: the URL of the latest chapter.
        '''
        # re-derive the book id from the URL, overriding the passed-in value
        book_id = self.get_id_from_url(url)
        html = self.get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        # the second <p> in #info holds the status; keep the part before the first comma
        book_st = soup.select('#wrapper .box_con #maininfo #info p')[1].text
        status = book_st.split(',')
        book_status = status[0]
        book_last_updata_time = soup.select(
            '#wrapper .box_con #maininfo #info p')[2].text
        book_last_updata_desc = soup.select(
            '#wrapper .box_con #maininfo #info p')[3].text
        book_last_updata_url = soup.select(
            '#wrapper .box_con #maininfo #info p a')[2].attrs['href']

        data_for_book_infos = {
            INFOS_STATUS: book_status,
            INFOS_LAST_UPDATE_TIME: book_last_updata_time,
            INFOS_LAST_UPDATE_DESC: book_last_updata_desc,
            INFOS_LAST_UPDATE_URL: book_last_updata_url
        }
        s = SpiderMongo()
        s.update_book_infos_by_book_id(book_id, data_for_book_infos)
        # the docstring promises the latest URL, so return it
        return book_last_updata_url
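
The matching SpiderMongo.update_book_infos_by_book_id is not shown either. Continuing the sketch, update_one with $set would overwrite just the four fields built above; matching on BOOK_ID, stored as an int by get_book_infos below, is an assumption:

    def update_book_infos_by_book_id(self, book_id, data):
        # $set only touches the keys present in data
        return self.db[BOOK_INFOS_COLLECTION].update_one(
            {BOOK_ID: int(book_id)}, {'$set': data})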
Example #4
    def get_book_infos(self, url, book_class, book_id):
        '''Scrape a book's info page, download its cover, store the record in MongoDB,
        and return the first-chapter URL together with the latest-update URL.'''
        html = self.get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        book_name = soup.select('#wrapper .box_con #maininfo #info h1')[0].text
        book_auhter = soup.select('#wrapper .box_con #maininfo #info p')[0].text
        # the second <p> holds the status; keep the part before the first comma
        book_st = soup.select('#wrapper .box_con #maininfo #info p')[1].text
        status = book_st.split(',')
        book_status = status[0]
        book_last_updata_time = soup.select('#wrapper .box_con #maininfo #info p')[2].text
        book_last_updata_desc = soup.select('#wrapper .box_con #maininfo #info p')[3].text
        book_last_updata_url = soup.select('#wrapper .box_con #maininfo #info p a')[2].attrs['href']
        img_url = soup.select('#wrapper .box_con #sidebar #fmimg img')[0].attrs['src']
        book_desc = soup.select('#wrapper .box_con #maininfo #intro')[0].text

        data_for_book_infos = {
            BOOK_ID: int(book_id),
            INFOS_CLASS: book_class,
            INFOS_NAME: book_name,
            INFOS_AUTHER: book_auhter,
            INFOS_IMG_URL: img_url,
            INFOS_STATUS: book_status,
            INFOS_LAST_UPDATE_TIME: book_last_updata_time,
            INFOS_LAST_UPDATE_DESC: book_last_updata_desc,
            INFOS_LAST_UPDATE_URL: book_last_updata_url,
            INFOS_DECS: book_desc
        }
        # download the cover image; on failure store None instead of the remote URL
        filepath = BOOK_IMG_DIR + '/' + book_id + '.jpg'
        try:
            urllib.request.urlretrieve(data_for_book_infos[INFOS_IMG_URL], filename=filepath)
            new_img_url = filepath
        except Exception as e:
            print("Error occurred when downloading file, error message:")
            print(e)
            new_img_url = None
        data_for_book_infos[INFOS_IMG_URL] = new_img_url
        # write into the book_infos collection
        spidermongo = SpiderMongo()
        spidermongo.insert_data_dabases(BOOK_INFOS_COLLECTION, data_for_book_infos)
        # the first <dd> link in the chapter list is the first chapter
        first_url = soup.select('#wrapper .box_con #list dl dd a')[0].attrs['href']
        return first_url, book_last_updata_url
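
Taken together, the methods suggest how a crawl might be driven: get_book_infos seeds the loop with the first chapter's URL, and get_book_details is then followed through the next_url it returns. A hedged sketch; the driver name, the chapter counter and the stop condition are assumptions, since the examples never show how the loop terminates:

def crawl_book(spider, index_url, book_class, book_id, max_capters=10000):
    # hypothetical driver; 'spider' is an instance of the class these methods belong to
    first_url, last_update_url = spider.get_book_infos(index_url, book_class, book_id)
    url, capter_numb = first_url, 1
    while capter_numb <= max_capters:
        next_url = spider.get_book_details(url, book_id, capter_numb)
        if url == last_update_url:  # assumption: the newest chapter was just stored
            break
        url, capter_numb = next_url, capter_numb + 1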
Example #5
    def is_book_id_in_mongodb(self, book_id):
        '''
        Args:    book_id: the ID of the book.
        Returns: the URL of the newest chapter if the id exists,
                 None if it does not.
        '''
        searchid = SpiderMongo()
        result = searchid.is_book_id_in_mongodb(book_id)
        if result is None:
            return result
        return result['book_last_updata_url']
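
On the storage side this lookup maps naturally onto find_one, which already yields None when nothing matches; one more addition to the SpiderMongo sketch, again matching on BOOK_ID as an int:

    def is_book_id_in_mongodb(self, book_id):
        # returns the matching book_infos document, or None if the id is unknown
        return self.db[BOOK_INFOS_COLLECTION].find_one({BOOK_ID: int(book_id)})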