Example No. 1
 def get_chapterurl(self, response):
     # print(response.text)
     item = DingdianItem()
     # # response.meta[key]: this retrieves the value passed down from the previous callback.
     # item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['name'] = response.meta['name']
     item['novelurl'] = response.meta['url']
     soup = BeautifulSoup(response.text, 'lxml')
     # <meta> tags in the HTML: fetch them by attrs, not with a plain find/find_all on the tag name
     """
     <meta name="og:novel:category" content="玄幻魔法"/> 
     <meta name="og:novel:author" content="横扫天涯"/> 
     <meta name="og:novel:book_name" content="天道图书馆"/> 
     <meta name="og:novel:read_url" content="https://www.x23us.com/html/67/67025/"/>  
     """
     category = soup.find(attrs={'name': 'og:novel:category'})['content']
     author = soup.find(attrs={'name': 'og:novel:author'})['content']
     bash_url = soup.find(attrs={'name': 'og:novel:read_url'})['content']
     # bash_url = BeautifulSoup(response.text, 'lxml').find('p', class_='btnlinks').find('a', class_='read')['href']
     name_id = str(response.url)[-6:-1].replace('/', '')
     item['category'] = str(category).encode('UTF-8')
     item['author'] = str(author)
     item['name_id'] = name_id
     # Don't use return to hand back the item here! return would end the method and the Request below would never run; use yield instead.
     yield item
     yield Request(url=bash_url, callback=self.get_chapter, meta={'name_id': name_id})
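A note on the meta-tag lookup above: BeautifulSoup's find(attrs={...}) matches any tag whose attributes carry the given values, which is what locates the <meta name="og:novel:..."> tags. A minimal standalone sketch, using the snippet quoted in the comment above:

from bs4 import BeautifulSoup

html = '''
<meta name="og:novel:category" content="玄幻魔法"/>
<meta name="og:novel:author" content="横扫天涯"/>
'''
soup = BeautifulSoup(html, 'lxml')
# find(attrs=...) matches on the name attribute, so the tag name never needs to be spelled out
print(soup.find(attrs={'name': 'og:novel:author'})['content'])  # 横扫天涯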
Example No. 2
    def get_chapterurl(self, response):
        print("==========get_chapterurl===============")
        item = DingdianItem()
        chaptSoup = BeautifulSoup(response.text, 'lxml')

        item['name'] = response.meta['name'].replace(chr(0xa0), '')
        item['author'] = chaptSoup.find('table').find_all(
            'td')[1].get_text().replace(chr(0xa0), '')
        item['novelurl'] = response.meta['url']
        item['serialnumber'] = chaptSoup.find('table').find_all(
            'tr')[1].find_all('td')[1].get_text().replace(chr(0xa0), '')
        item['serialstatus'] = chaptSoup.find('table').find_all(
            'tr')[0].find_all('td')[2].get_text().replace(chr(0xa0), '')
        item['category'] = chaptSoup.find('table').find('a').get_text()

        #latestChapterUrl = chaptSoup.find('p', class_='btnlinks').find('a', class_="read")['href']
        #item['novel_id'] = latestChapterUrl[-16:-11]  # breaks when the id has a different number of digits
        item['novel_id'] = int(
            re.findall(r"xiaoshuo/(\d+)\.html", item['novelurl'])[0])

        # if only the functionality above is needed, the item must still be returned here, otherwise it never reaches pipelines.py
        # return item

        yield item
        latestChapterUrl = chaptSoup.find('p', class_='btnlinks').find(
            'a', class_="read")['href']
        yield Request(url=latestChapterUrl,
                      callback=self.get_chapter,
                      meta={"novel_id": item['novel_id']})
Example No. 3
 def parse_books(self, response):
     # parse the book's overview page
     item = DingdianItem()
     item['category'] = response.xpath(
         '//tr/td[1]/a/text()').extract_first()  # novel category
     item['book_author'] = response.xpath(
         '//tr[1]/td[2]/text()').extract_first().strip()  # author, with surrounding whitespace stripped
     item['book_id'] = re.findall(r'/[0-9]+', response.url)[0][1:]
     item['book_name'] = response.xpath(
         '//*[@id="content"]/dd[1]/h1/text()').extract_first()[:-4]  # novel title
     item['book_status'] = response.xpath(
         '//tr[1]/td[3]/text()').extract_first().strip()  # serialization status
     item['book_url'] = response.url
     item['clicks'] = response.xpath(
         '//tr[3]/td[1]/text()').extract_first().strip()  # total clicks
     item['recommend'] = response.xpath(
         '//tr[4]/td[1]/text()').extract_first().strip()  # total recommendations
     item['book_img_url'] = response.xpath(
         '//*[@id="content"]/dd[2]/div[1]/a/img/@src').extract_first()
     item['summary'] = response.xpath('//dd/p[2]').extract_first()  # synopsis
     item['length'] = response.xpath(
         '//tr[2]/td[2]/text()').extract_first().strip()  # total word count
     item['latest_update_time'] = response.xpath(
         '//tr[2]/td[3]/text()').extract_first().strip()  # last update time
     item['flag'] = 1
     catalog = response.xpath(
         '//div/p[2]/a[1]/@href').extract_first()  # chapter catalog link
     url_md5 = hashlib.md5(item['book_url'].encode('gb2312')).hexdigest()
     if self.r.sadd('books', url_md5):
         yield item
     yield scrapy.Request(url=catalog, callback=self.parse_catalog)
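The Redis line above is the dedup gate: SADD returns 1 only the first time a member joins the set, so each book is yielded at most once across runs. A hedged standalone sketch; it assumes a Redis server reachable on localhost, and the key name 'books' and gb2312 encoding simply mirror the code above:

import hashlib
import redis

r = redis.Redis()  # assumption: local Redis on the default port
url_md5 = hashlib.md5('https://example.com/book/1'.encode('gb2312')).hexdigest()
if r.sadd('books', url_md5):
    print('first sighting - emit the item')
else:
    print('already crawled - skip')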
Example No. 4
    def parse_details(self, response):
        book_name = response.xpath('//h1/text()').extract_first()
        book_anthor = response.xpath(
            '//table[@id="at"]/tr[1]/td[2]/text()').extract_first()
        book_type = response.xpath(
            '//table[@id="at"]/tr[1]/td[1]/a/text()').extract_first()
        book_status = response.xpath(
            '//table[@id="at"]/tr[1]/td[3]/text()').extract_first()
        book_words = response.xpath(
            '//table[@id="at"]/tr[2]/td[2]/text()').extract_first()
        book_time = response.xpath(
            '//table[@id="at"]/tr[2]/td[3]/text()').extract_first()
        book_click_nums = response.xpath(
            '//table[@id="at"]/tr[3]/td[1]/text()').extract_first()

        item = DingdianItem()
        item['book_name'] = book_name
        item['book_anthor'] = book_anthor
        item['book_status'] = book_status
        item['book_type'] = book_type
        item['book_words'] = book_words
        item['book_time'] = book_time
        item['book_click_nums'] = book_click_nums

        yield item
Example No. 5
 def get_chapterurl(self, response):
     # instantiate the item
     item = DingdianItem()
     # '\xa0' is the non-breaking space (&nbsp;) that needs replacing
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     all_start = BeautifulSoup(response.text, 'lxml').find('table', id='at')
     category = all_start.find('a').get_text()
     author = all_start.find_all('td')[1].get_text().replace('\xa0', '')
     chapter_list_url = BeautifulSoup(response.text,
                                      'lxml').find('a',
                                                   class_='read')['href']
     # len('http://www.23us.com/book/') == 25, so the remainder of the URL is the numeric id
     length = len(item['novelurl']) - 25
     name_id = item['novelurl'][-length:]
     item['category'] = category
     item['author'] = author
     item['name_id'] = name_id
     yield item
     yield Request(chapter_list_url,
                   callback=self.get_chapter,
                   meta={
                       'name_id': name_id,
                       'chapter_list_url': chapter_list_url
                   })
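The arithmetic around the 25-character prefix above can be written more directly: slice off the known prefix instead of computing a negative tail length. An equivalent sketch, under the same assumption that every novelurl starts with http://www.23us.com/book/:

prefix = 'http://www.23us.com/book/'
novelurl = prefix + '67025'  # hypothetical example value
name_id = novelurl[len(prefix):]  # same result as the length-25 arithmetic
print(name_id)  # 67025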
Example No. 6
 def get_name(self, response):
     url_node = response.xpath("//tr[@bgcolor='#FFFFFF']")
     for node in url_node:
         item = DingdianItem()
         item['novel_name'] = node.xpath(".//a/@title").extract()[0]
         item['novel_url'] = node.xpath(".//a/@href").extract()[1]
         print(item)
         #print(novel_url)
         yield item
Example No. 7
    def get_novelcontent(self, response):
        #targentcontent=response.meta['targentcontent']
        #print targentcontent['novelurl'],targentcontent['name']
        #title = response.xpath('//dd[1]/h1/text()').extract_first()
        novel_instroduce_url = response.url  # novel page URL
        novel_name = response.meta['name']  # novel title
        chapterlisturl = response.meta['chapterlisturl']  # chapter-list URL
        author = response.xpath(
            '//table/tr[1]/td[2]/text()').extract_first()  # author
        serialstatus = response.xpath(
            '//table/tr[1]/td[3]/text()').extract_first()  # serialization status
        serialnumber = response.xpath(
            '//table/tr[2]/td[2]/text()').extract_first()  # serialized word count
        category = response.xpath(
            '//table/tr[1]/td[1]/a/text()').extract_first()  # novel category
        name_id = chapterlisturl.split('/')[-1]  # novel id
        collect_num_total = response.xpath(
            '//table/tr[2]/td[1]/text()').extract_first()  # total bookmarks
        click_num_total = response.xpath(
            '//table/tr[3]/td[1]/text()').extract_first()  # total clicks

        #chapterlistul=response.xpath('//dd[2]/div[2]/p[2]/a/text()').extract_first()
        #chapterlisturl=response.xpath('//dd[2]/div[2]/p[2]/a/@href').extract_first()
        novel_breif = response.xpath('//dd[2]/p[2]').extract_first()
        '''
        print('novel_instroduce_url = %s' % novel_instroduce_url)
        print('chapterlisturl = %s' % chapterlisturl)
        print('author = %s' % len(author))
        print('serialstatus = %s' % len(serialstatus))
        print('serialnumber = %s' % len(serialnumber))
        print('category = %s' % len(category))
        print('name_id = %s' % name_id)
        print('collect_num_total = %s' % int(collect_num_total))
        print('click_num_total = %s' % int(click_num_total))
        '''

        targentcontent = DingdianItem()
        targentcontent['novel_name'] = novel_name
        targentcontent['author'] = author
        targentcontent['novel_instroduce_url'] = novel_instroduce_url
        targentcontent['novelurl'] = chapterlisturl
        targentcontent['serialstatus'] = serialstatus
        targentcontent['serialnumber'] = serialnumber
        targentcontent['category'] = category
        targentcontent['name_id'] = name_id
        targentcontent['collect_num_total'] = collect_num_total
        targentcontent['click_num_total'] = click_num_total
        targentcontent['novel_breif'] = novel_breif
        #print(u'novel_name=%s,author=%s,novel_instroduce=%s,serialstatus=%s,serialnumber=%s,category=%s,name_id=%s,collect_num_total=%s,click_num_total=%s,chapterlisturl=%s'  % (novel_name,author,novel_instroduce,serialstatus,serialnumber,category,name_id,collect_num_total,click_num_total,chapterlisturl))
        #yield targentcontent
        yield Request(chapterlisturl,
                      dont_filter=True,
                      callback=self.get_charaterurl,
                      meta={'targentcontent': targentcontent})
Example No. 8
	def get_chapterurl(self, response):
		item = DingdianItem()
		item['name'] = str(response.meta['name']).replace('\xa0', '')
		item['novelurl'] = response.meta['url']
		category = BeautifulSoup(response.text, 'lxml').find('table').find('a').get_text()
		author = BeautifulSoup(response.text, 'lxml').find('table').find_all('td')[1].get_text()
		bash_url = BeautifulSoup(response.text, 'lxml').find('p', class_='btnlinks').find('a', class_='read')['href']
		name_id = str(bash_url)[-6:-1].replace('/','')
		item['category'] = str(category).replace('/','')
		item['author'] = str(author).replace('/','')
		item['name_id'] = name_id
		return item
Example No. 9
 def chapterurl(self, response):
     item = DingdianItem()
     item['name'] = response.meta['name']
     item['novelurl'] = response.meta['url']
     item['author'] = response.meta['author']
     item['category'] = BeautifulSoup(response.text, 'lxml').find('table', bgcolor='#E4E4E4').find('a').get_text()
     bash_url = BeautifulSoup(response.text, 'lxml').find('p', class_='btnlinks').find('a', class_='read')['href']
     name_id = str(bash_url)[-6:-1].replace('/', '')
     item['name_id'] = name_id
     # item['serialstatus'] = response.meta['status']
     yield item
     yield Request(bash_url, callback=self.get_chapter, meta={'name_id':name_id})
Example No. 10
 def get_chapturl(self, response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     category = BeautifulSoup(
         response.text, 'lxml').find('dt').find_all('a')[5].get_text()
     author_info = BeautifulSoup(response.text,
                                 'lxml').find('h3').get_text()
     name_id = ''.join(response.meta['url'].split('/')[-3:-1])
     item['category'] = category
     item['author'] = re.split(r'[:\xa0;]', author_info)[1]
     item['name_id'] = name_id
     return item
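The re.split call above peels the author name out of a header string; the character class mixes a fullwidth colon, a non-breaking space, and a semicolon because the page formats that header inconsistently. A quick check with a hypothetical header value:

import re

author_info = '作者:横扫天涯'  # hypothetical <h3> text with a fullwidth colon
print(re.split(r'[:\xa0;]', author_info)[1])  # 横扫天涯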
Example No. 11
 def get_chapterurl(self, response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     # category=BeautifulSoup(response.text,'lxml').find('table').find('a').get_text()
     td_list = BeautifulSoup(response.text,
                             'lxml').find('table').find_all('td')
     item['category'] = str(td_list[0].get_text()).replace('\xa0', '')
     item['author'] = str(td_list[1].get_text()).replace('\xa0', '')
     item['serialstatus'] = str(td_list[2].get_text()).replace('\xa0', '')
     item['serialnumber'] = str(td_list[4].get_text()).replace('\xa0', '')
     item['name_id'] = str(response.url)[-10:-5].replace('/', '')
     return item
Example No. 12
 def get_chapterurl(self, response):
     # response = requests.get(novelurl)
     soup = BeautifulSoup(response.text, 'lxml')
     item = DingdianItem()
     item['name'] = response.meta['name']
     item['novelurl'] = response.meta['url']
     item['name_id'] = response.url[-6:].replace('/', '')
     item['category'] = soup.find('table').find_all('td')[0].get_text()
     item['author'] = soup.find('table').find_all('td')[1].get_text()
     item['serialstatus'] = soup.find('table').find_all('td')[2].get_text()
     item['serialnumber'] = soup.find('table').find_all('td')[4].get_text()
     item['lastUpdate'] = soup.find('table').find_all('td')[5].get_text()
     return item
Example No. 13
 def get_chapterurl(self, response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     soup = BeautifulSoup(response.text, 'lxml')
     # <meta> tags: fetch them by attrs, not with a plain find/find_all on the tag name
     category = soup.find(attrs={'name':'og:novel:category'})['content']
     author = soup.find(attrs={'name':'og:novel:author'})['content']
     name_id = str(response.url)[-6:-1].replace('/', '')
     item['category'] = str(category)
     item['author'] = str(author)
     item['name_id'] = name_id
     yield item
     yield Request(response.url, callback=self.get_chapter, meta={'name': name_id})
Example No. 14
    def parse(self, response):
        for ms in response.xpath("//div[contains(@class,'i_w')]"):
            item = DingdianItem()
            title = ms.xpath("div/div/strong/text()").extract_first()
            hot = ms.xpath("div/div/span/text()").extract_first()
            item["title"] = title
            item["hot"] = hot
            yield item

        next_page = response.xpath("//a[@class='next']/@href").extract_first()
        print("下一页:", next_page)
        if next_page: yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
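The pagination idiom at the end of parse() deserves a self-contained sketch: response.urljoin resolves a relative next-page href against the current URL, and re-scheduling parse() as its own callback walks every page until no next link remains. The spider name and start URL below are placeholders:

import scrapy

class PagingSketchSpider(scrapy.Spider):
    name = 'paging_sketch'                     # hypothetical
    start_urls = ['http://example.com/list']   # placeholder

    def parse(self, response):
        # ... extract items from this page here ...
        next_page = response.xpath("//a[@class='next']/@href").extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)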
Example No. 15
 def get_chapterurl(self, response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     soup = BeautifulSoup(response.text, 'lxml')
     category = soup.find('table').find('a').get_text()
     author = soup.find('table').find_all('td')[1].get_text()
     base_url = soup.find('p',
                          class_='btnlinks').find('a',
                                                  class_='read')['href']
     name_id = str(base_url)[-6:-1].replace('/', '')
     item['category'] = str(category).replace('/', '')
     item['author'] = str(author).replace('/', '')
     item['name_id'] = str(name_id).replace('/', '')
     yield item  # yield makes this method a generator of items; it can be stepped through with next()
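On the trailing comment above: any function body containing yield produces a generator when called; Scrapy drains it itself, but it can equally be stepped through by hand. A tiny illustration:

def emit():
    yield {'name': 'demo item'}

gen = emit()       # calling it runs no body code yet
print(next(gen))   # {'name': 'demo item'}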
Example No. 16
 def content_html(self, response):
     item = DingdianItem()  # the item defined in items.py that stores the data
     # locate the chapter title
     title1 = response.xpath(
         '//*[@id="amain"]/dl/dd[1]/h1/text()').extract()[0]
     item['book'] = response.meta['title']  # book name
     item['article_title'] = title1  # chapter title
     item['author'] = response.meta['author']  # author
     content_all = ''  # empty string to accumulate the scraped text
     # the chapter body comes back as a list; MySQL cannot store a list directly
     content_con = response.xpath('//*[@id="contents"]/text()').extract()
     for i in content_con:  # walk the list
         content_all = content_all + i.strip()  # strip whitespace and append each piece
     item['content'] = content_all  # the assembled content
     yield item
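The accumulation loop above works, but the same cleanup is usually written as a single join, which avoids repeated string concatenation. An equivalent sketch with a stand-in for the extracted list:

content_con = ['  第一段 ', ' 第二段  ']  # stand-in for the xpath().extract() result
content_all = ''.join(i.strip() for i in content_con)
print(content_all)  # 第一段第二段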
Example No. 17
 def get_chapter(self, response):
     item = DingdianItem()
     item['name'] = response.meta['novel_name']
     item['novelurl'] = response.meta['novel_url']
     print(item['name'], item['novelurl'])
     author = response.xpath('//div[@class="block_txt2"]/p/a').extract()[1]
     author = re.search(r'>(.*?)</a>', author).group(1)
     print(author)
     category = response.xpath('//div[@class="block_txt2"]/p/a').extract()[2]
     category = re.search(r'>(.*?)</a>', category).group(1)
     print(category)
     name_id = str(response.url.split(r'/')[-1])
     print(name_id)
     item['author'] = str(author)
     item['category'] = str(category)
     item['name_id'] = str(name_id)
     yield item
Example No. 18
 def get_chapterurl(self,response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     item['name_id'] = response.meta['name_id']
     bs = BeautifulSoup(response.text, 'lxml').find('table',id='at').find_all('td')
     category = bs[0].get_text()
     author = bs[1].get_text()
     serialstatus = bs[2].get_text()
     sn = bs[4].get_text()
     serialnumber = re.split(r'\D+', str(sn))[1]
     item['category'] = str(category).replace('/', '')
     item['author'] = str(author).replace('/', '')
     item['serialstatus'] = str(serialstatus).replace('/', '')
     item['serialnumber'] = str(serialnumber)
     yield item
     # bash_url is not defined in the original snippet; derived here the same way as in the sibling examples
     bash_url = BeautifulSoup(response.text, 'lxml').find('p', class_='btnlinks').find('a', class_='read')['href']
     yield Request(url=bash_url, callback=self.get_chapter, meta={'name_id': item['name_id']})
Example No. 19
 def get_chapterurl(self, response):
     #response.encoding = 'gbk'
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novel_url'] = response.meta['url']
     category = BeautifulSoup(response.text, 'lxml').find(
         'div', style='width:550px;').find('a').get_text().strip()
     author = BeautifulSoup(response.text, 'lxml').find(
         'div', style='width:550px;').find_all('td')[1].get_text().strip()
     name_id = BeautifulSoup(response.text, 'lxml').find(
         'p', class_='btnlinks').find('a',
                                      class_='read')['href'][-6:-1].replace(
                                          '/', '')
     item['category'] = str(category)
     item['author'] = str(author)
     item['name_id'] = str(name_id)
     return item
Example No. 20
    def parse(self, response):
        lists = []  # list to collect every item on this page

        # Inspecting the page shows all the movie info lives in <li> tags inside the
        # <ul class="list-unstyled vod-item-img ff-img-215">.
        movies = response.xpath(
            '//ul[@class="list-unstyled vod-item-img ff-img-215"]/li')
        for movie in movies:
            item = DingdianItem()  # one DingdianItem per result
            # .xpath() returns a list, so .extract()[0] pulls out the first match
            item['name'] = movie.xpath(
                './/p[@class="image"]//img/@alt').extract()[0]
            item['img'] = movie.xpath(
                './/p[@class="image"]//img/@data-original').extract()[0]
            item['movie'] = 'http://nlook1.cn' + movie.xpath(
                './/p[@class="image"]/a/@href').extract()[0]
            lists.append(item)  # add it to the lists list
        return lists  # the return is required: without it nothing reaches the pipeline to be saved
Example No. 21
 def get_charpter_url(self, response):
     item = DingdianItem()
     bs = BeautifulSoup(response.text, 'lxml')
     item['name'] = response.meta['name']
     item['novel_url'] = response.meta['url']
     item['category'] = bs.find('table').find('a').get_text()
     item['author'] = bs.find('table').find_all('td')[1].get_text().replace('\xa0', '')
     item['novel_id'] = bs.find('p', class_='btnlinks').find('a', class_='read')['href'][-6:-1].replace('/', '')
     item['serial_status'] = bs.find('table').find('tr').find_all('td')[-1].get_text().replace('\xa0', '')
     item['serial_length'] = bs.find('table').find_all('tr')[1].find_all('td')[1].get_text().replace('\xa0', '')
     bash_url = bs.find('p', class_='btnlinks').find('a', class_='read')['href']
     ret = Sql.select_name(item['novel_id'])
     if ret == 1:
         print("该小说已存在")
     else:
         yield item
     yield Request(bash_url, callback=self.get_chapter, meta={'novel_id': item['novel_id']})
Example No. 22
 def get_chapterurl(self, response):
     Item = DingdianItem()
     Item["name"] = str(response.meta["name"]).replace("\xa0", "")
     Item["novelUrl"] = response.meta["url"]
     htmlsoup = BeautifulSoup(response.text, "lxml")
     category = htmlsoup.find("table").find("a").get_text()
     author = htmlsoup.find("tr").find_all("td")[1].get_text()
     bash_url = htmlsoup.find("p",
                              class_="btnlinks").find("a",
                                                      class_="read")["href"]
     name_id = bash_url.split("/")[-2]
     Item["category"] = str(categroy).replace("\a0", "")
     Item["author"] = str(author).replace("\a0", "")
     Item["name_id"] = name_id
     yield Item
     yield Request(bash_url,
                   callback=self.get_chapter,
                   meta={"name": name_id})
Example No. 23
 def get_chapterurl(self, response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     category = BeautifulSoup(response.text,
                              'lxml').find('table').find('a').get_text()
     author = BeautifulSoup(
         response.text, 'lxml').find('table').find_all('td')[1].get_text()
     bash_url = BeautifulSoup(response.text, 'lxml').find(
         'p', class_='btnlinks').find('a', class_='read')['href']
     name_id = str(bash_url)[-6:-1].replace('/', '')
     item['category'] = str(category).replace('/', '')
     item['author'] = str(author).replace('\xa0', '')
     item['name_id'] = name_id
     yield item
     yield Request(url=bash_url,
                   callback=self.get_chapter,
                   meta={'name_id': name_id})
Example No. 24
 def get_chapterurl(self, response):
     item = DingdianItem()
     # novel title
     item['name'] = response.meta['name']
     # novel detail-page URL
     item['novelurl'] = response.meta['url']
     # category
     item['category'] = BeautifulSoup(
         response.text, 'lxml').find('table').find('a').get_text()
     # author
     item['author'] = BeautifulSoup(
         response.text,
         'lxml').find('table').find_all('td')[1].get_text().replace(
             "\xa0", '')
     # latest chapter
     item['new'] = BeautifulSoup(response.text, 'lxml').find(
         'p', class_='btnlinks').find('a', class_='read')['href']
     return item
Example No. 25
 def get_chapterurl(self, response):
     item = DingdianItem()
     item['name'] = response.meta['name']
     item['novelurl'] = response.meta['url']
     item['author'] = response.xpath(
         '//tr[1]/td[2]/text()').extract_first().strip()
     item['serialstatus'] = response.xpath(
         '//tr[1]/td[3]/text()').extract_first().strip()
     item['serialnum'] = response.xpath(
         '//tr[2]/td[2]/text()').extract_first().strip()
     item['category'] = response.xpath(
         '//tr[1]/td[1]/a/text()').extract_first().strip()
     find_name_id = re.compile(r'/(\d+)\.', re.S)
     name_id = re.findall(find_name_id, response.meta['url'])[0]
     item['name_id'] = name_id
     #print(item['name'],item['author'],item['serialstatus'],item['serialnum'],item['category'])
     self.count += 1
     print(self.count)
     return item
Example No. 26
    def parse_detail(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        table = soup.find('table')
        trs = table.find_all('tr')
        auth = trs[0].find_all('td')[1].get_text().replace(
            '\xa0', '')  # \xa0 is the &nbsp; entity from the HTML
        status = trs[0].find_all('td')[2].get_text().replace('\xa0', '')
        words = trs[1].find_all('td')[1].get_text().replace('\xa0', '')
        last_time = trs[1].find_all('td')[2].get_text().replace('\xa0', '')

        item = DingdianItem()
        item['book_name'] = response.meta['book_name']
        item['new_chapter'] = response.meta['new_chapter']
        item['auth'] = auth
        item['status'] = status
        item['words'] = words
        item['last_time'] = last_time

        return item
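The \xa0/&nbsp; comment above is the reason nearly every example in this collection calls .replace('\xa0', ''): the entity survives parsing as a non-breaking space character. A demonstration on a one-cell fragment (html.parser is used so the bare <td> survives as-is):

from bs4 import BeautifulSoup

cell = BeautifulSoup('<td>作&nbsp;者:横扫天涯</td>', 'html.parser').td.get_text()
print(repr(cell))                # '作\xa0者:横扫天涯'
print(cell.replace('\xa0', ''))  # 作者:横扫天涯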
Example No. 27
 def get_chapterurl(self, response):
     # create a DingdianItem to hold what we scraped
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace(
         u'\xa0', u'')  # strip up front: the unicode '\xa0' character causes trouble when encoding to gbk
     item['novelurl'] = response.meta['url']
     soup = BeautifulSoup(response.text, 'lxml')
     category = soup.table.a.get_text()
     author = soup.table.find_all('td')[1].get_text()
     bash_url = soup.find('p',
                          class_='btnlinks').find('a',
                                                  class_='read')['href']
     name_id = str(bash_url)[-6:-1].replace(u'/', u'')
     item['category'] = str(category).replace(u'/', u'')
     item['author'] = str(author).replace(u'/', u'')
     item['name_id'] = name_id
     yield item
     yield Request(bash_url,
                   callback=self.get_chapter,
                   meta={'name_id': name_id})
Example No. 28
    def get_chapterurl(self, response):
        item = DingdianItem()
        item['name'] = ((response.meta['name']).replace('\xa0', ''))
        item['novelurl'] = response.meta['url'].encode('utf-8')
        category = BeautifulSoup(response.text,
                                 'lxml').find('table').find('a').get_text()
        author = BeautifulSoup(
            response.text, 'lxml').find('table').find_all('td')[1].get_text()
        base_url = BeautifulSoup(response.text, 'lxml').find(
            'p', class_='btnlinks').find('a', class_='read')['href']
        # name_id = ((base_url)[-11:-5].replace('/', '')).encode('utf-8')
        pattern = re.compile(r'\d+')
        # name_id = pattern.findall(str(base_url))[1].encode('utf-8')
        name_id = pattern.findall(str(response.meta['url']))[1]

        item['category'] = ((category).replace('/', ''))
        item['author'] = (str(author).replace('\xa0', ''))
        item['name_id'] = name_id

        # return item
        yield item
Example No. 29
 def get_all(self, response):
     '''
     Parse the page, extract each field, and populate the item.
     :param response:
     :return:
     '''
     item = DingdianItem()
     html = response.text
     name = BeautifulSoup(html, 'lxml').find('h1').get_text().split()[0]
     novelurl = BeautifulSoup(html, 'lxml').find('a', class_='read')['href']
     bs_table = BeautifulSoup(html, 'lxml').find('table')
     author = bs_table.find_all('td')[1].get_text().split()[0]
     status = bs_table.find_all('td')[2].get_text().split()[0]
     number = bs_table.find_all('td')[4].get_text().split()[0][:-1]
     category = bs_table.find_all('td')[0].get_text().split()[0]
     name_id = re.findall(r'down/(\d+)', html)[0]
     item['name'] = name
     item['author'] = author
     item['novelurl'] = novelurl
     item['status'] = status
     item['number'] = number
     item['category'] = category
     item['name_id'] = name_id
     return item
Example No. 30
 def get_chapterurl(self, response):
     item = DingdianItem()
     item['name'] = str(response.meta['name']).replace('\xa0', '')
     item['novelurl'] = response.meta['url']
     item['category'] = response.meta['cat']
     author = BeautifulSoup(response.text, 'lxml').find(
         id='info').find('p')
     author = str(author.get_text()).split(':')[1]
     item['author'] = author
     name_id = str(response.url)[-6:-1].replace('/', '')
     item['name_id'] = name_id
     yield item
     dds = BeautifulSoup(response.text, 'lxml').find_all('dd')
     num = 0
     for novel in dds:
         num = num + 1
         url = response.url + novel.find('a')['href']
         chapter_title = novel.find('a').get_text()
         rets = Sql.select_chapter(url)
         if rets[0] == 1:
             print('the chapter already exists!')
         else:
             yield Request(url, callback=self.get_chapter, meta={'num': num, 'name_id': name_id, 'chaptername': chapter_title, 'chapterurl': url})
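Examples No. 21 and No. 30 lean on a Sql helper (select_name, select_chapter) that is defined elsewhere in the project and not shown in these snippets. A hedged sqlite3 stand-in that matches the contract observed above, where select_chapter returns a row whose first field is 1 when the chapter URL is already stored (select_name, as used in Example No. 21, appears to return a bare 0/1 flag instead):

import sqlite3

class Sql:
    conn = sqlite3.connect('dingdian.db')  # assumed database file name
    conn.execute('CREATE TABLE IF NOT EXISTS chapters (chapterurl TEXT PRIMARY KEY)')

    @classmethod
    def select_chapter(cls, url):
        # returns (1,) if the chapter URL is already stored, (0,) otherwise
        cur = cls.conn.execute(
            'SELECT EXISTS(SELECT 1 FROM chapters WHERE chapterurl = ?)', (url,))
        return cur.fetchone()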