Example #1
 def main_parse(self, response):
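     # Note: restrict_css limits extraction to links found inside the matched region (here the member table)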
     le = LinkExtractor(restrict_css='div.all_member table.sy_table')
     links = le.extract_links(response)
     print(len(links))
     for link in links:
         examples = ExampleItem()
         examples['url_examples'] = [link.url]
         yield examples
Example #2
 def parse(self, response):
     le = LinkExtractor(restrict_css='article')
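     # extract_links() returns a de-duplicated list of Link objects found in the restricted region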
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, callback=self.parse_page)
     le = LinkExtractor(restrict_css='li.next')
     links = le.extract_links(response)
     if links:
         next_url = links[0].url
         yield scrapy.Request(next_url, callback=self.parse)
Example #3
    def parse(self, response):
        le = LinkExtractor(restrict_css='article.product_pod h3')
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_book)

        le = LinkExtractor(restrict_css='ul.pager li.next')
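        # Follow the "next" pager link (if any) back into this same callback to paginate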
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)
Example #4
    def parse(self, response):
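        # Each li under ol.row is one book; collect its fields, then follow the next-page link below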
        book_list = response.xpath("//ol[@class='row']//li")
        for books in book_list:
            book_message = BooksItem()
            book_message['book_title'] = books.xpath(
                './/h3/a/@title').extract_first()
            book_message['price'] = books.xpath(
                ".//div[@class='product_price']/p/text()").extract_first()
            book_message[
                'book_link'] = 'http://books.toscrape.com/' + books.xpath(
                    "./article/div/a/@href").extract_first()
            yield book_message
        '''
        # Extract the next-page link with a Selector
        next_url = response.xpath("//ul[@class='pager']/li[@class='next']/a/@href").extract()
        if next_url:
            next_page = response.urljoin(next_url[0])
            yield scrapy.Request(next_page, callback=self.parse)
        '''

        # Extract the next-page link with LinkExtractor
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)
Example #5
 def parse(self, response):
     pattern = r'/gsschool/.+\.shtml'
     link = LinkExtractor(allow=pattern)
     links = link.extract_links(response)
     print(type(links))
     for link in links:
         print(link)
Example #6
    def parse(self, response):
        body = Selector(text=response.body)
        images = body.css('img.image-section__image').extract()
        result = body.css(
            'img.image-section__image ::attr(src)').extract_first()

        #yield {'img_url': result}
        if result:
            yield ImagecrawlerItem(file_urls=[result])

        if len(os.listdir('/work/imagecrawler/output/full')) > 0:
            for name in os.listdir('/work/imagecrawler/output/full'):
                temp = name.find('?')
                new_name = name[:temp]
                shutil.copy('/work/imagecrawler/output/full/' + name,
                            '/work/imagecrawler/output/result/' + new_name)

        # body.css().extract() returns a list which might be empty
        #for image in images:
        #img_url = PexelsScraper.src_extractor.findall(image)[0]
        #tags = [tag.replace(',', '').lower() for tag in PexelsScraper.tags_extractor.findall(image)[0].split(' ')]
        #yield {'img_url': img_url}
        #yield {'img_tags': tags}

        link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]
        #yield {'next_links': next_links}
        for link in next_links:
            yield scrapy.Request(link, self.parse)
Example #7
 def parse(self, response):
     le = LinkExtractor(restrict_css='div.toctree-wrapper.compound',
                        deny='/index.html$')
     links = le.extract_links(response)
     print(len(links))
     print(links)
     for link in links:
         yield Request(link.url, callback=self.parse_example)
Example #8
 def parse(self, response):
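     # restrict_xpaths plays the same role as restrict_css, but with XPath expressions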
     link = LinkExtractor(restrict_xpaths="//ul[@class='cont_xiaoqu']//li")
     links = link.extract_links(response)
     for link_line in links:
         print(link_line.url, link_line.text)
         item = LinkdemoItem()
         item["url"] = link_line.url
         item["text"] = link_line.text
         yield item
Example #9
    def parse(self, response):
        link = LinkExtractor(
            restrict_css=
            'body > div.wrap > div.middleright > div > div.cartoon_online_border > ul > li'
        )
        links = link.extract_links(response)
        # link1 = link.extract_links(response)[0]

        for link in links:
            yield Request(url=link.url, callback=self.parse2, dont_filter=True)
Example #10
 def parse(self, response):
     link_regulation = LinkExtractor(restrict_css='section')
     url_list = link_regulation.extract_links(response)
     if url_list:
         for link in url_list:
             url = link.url
             if 'page-' in url:
                 yield scrapy.Request(url, callback=self.parse)
             else:
                 yield scrapy.Request(url, callback=self.parse_detail)
Example #11
    def parse(self, response):
        body = Selector(text=response.body)
        images = body.css('img').extract()
        for image in images:
            image = image.encode("utf-8")
            if PexelsScraper.src_extractor.findall(image):
                img_url = PexelsScraper.src_extractor.findall(image)[0]
                if img_url not in PexelsScraper.crawled_urls:
                    if 'http' not in img_url:
                        print(img_url)
                        print(self.start_urls[0])
                        print(PexelsScraper.domain_extractor.findall(
                            self.start_urls[0]))
                        img_url = PexelsScraper.domain_extractor.findall(
                            self.start_urls[0])[0][0] + img_url
                        print(img_url)
                    PexelsScraper.crawled_urls.add(img_url)
                    tags = ""
                    img_name = ""
                    img_type = ""
                    if PexelsScraper.tags_extractor.findall(image):
                        tags = PexelsScraper.tags_extractor.findall(
                            image)[0].replace(',', '').lower()
                    print(img_url, tags)
                    if '/' in img_url and len(
                            PexelsScraper.filename_extractor.findall(
                                img_url)) > 0:
                        img_name = PexelsScraper.filename_extractor.findall(
                            img_url)[0][0]
                        img_type = PexelsScraper.filename_extractor.findall(
                            img_url)[0][1]
                        print(img_name)
                    data = requests.get(img_url).content
                    im = Image.open(BytesIO(data))
                    width, height = im.size
                    # PexelsScraper.image_width = im.size[0]
                    # PexelsScraper.image_height = im.size[1]
                    img_aspect_ratio = self.calculate_aspect(width, height)
                    yield ImagecrawlerItem(source_url=response.url,
                                           img_url=img_url,
                                           alternate_text=tags,
                                           img_width=width,
                                           img_height=height,
                                           img_name=img_name,
                                           img_type=img_type,
                                           img_aspect_ratio=img_aspect_ratio)

        link_extractor = LinkExtractor()
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]
        # Crawl the filtered links
        for link in next_links:
            yield scrapy.Request(link, self.parse)
Example #12
    def parse(self, response):
        link_extractor = LinkExtractor(allow=RotaractSpider.url_matcher)
        links = [link.url for link in link_extractor.extract_links(response)]

        for link in links:
            flag = True
            article_links = []
            yield scrapy.Request(url=link,
                                 callback=self.parse_articles,
                                 meta={
                                     'article_links': article_links,
                                     'flag': flag
                                 })
Example #13
 def parse(self, response):
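     # deny drops URLs matching the given pattern even when they sit inside the restricted XPath region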
     link = LinkExtractor(
         deny='/fang1/a2/',
         restrict_xpaths=
         '//div[@class="f-filter f-w1190"]//dd[@class="info"]/div[@class="thr-list"]//li[@class="item"]/a'
     )
     links = link.extract_links(response)
     for i in links:
         city_name = re.split('/', i.url)[-3]
         yield Request(i.url,
                       callback=self.get_index,
                       meta={
                           'city_name': city_name,
                           'dont_redirect': True
                       },
                       dont_filter=True)
Example #14
File: ahthor.py Project: zybk01/ppython
    def callload(self,response):

        link = LinkExtractor(restrict_xpaths='//*[@cellspacing="1"]//a')
        link = link.extract_links(response)
        for urllist in link:
            url = urllist.url
            if url not in self.loaded:
                self.loaded.append(url)

                request = scrapy.Request(url, callback=self.parse,
                                         headers={'User-Agent': 'Mozilla/5.0'},
                                         dont_filter=True)
                path = self.path + '/'+urllist.text
                request.meta['item'] = path
                yield request
            time.sleep(2)
Example #15
 def get_index(self, response):
     city_name = response.meta['city_name']
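     # allow keeps only URLs matching the listing-page pattern within the restricted region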
     link = LinkExtractor(
         allow='/fang1/.*htm',
         restrict_xpaths=
         '//div[@class="f-main f-clear f-w1190"]//div[@class="f-main-list"]/div[@class="f-list js-tips-list"]/div'
     )
     links = link.extract_links(response)
     for i in links:
         city = re.split(r'/|\.', i.url)[2]
         yield Request(i.url,
                       callback=self.get_message,
                       meta={
                           'city': city,
                           'city_name': city_name,
                           'dont_redirect': True
                       },
                       dont_filter=True)
Example #16
    def parse(self, response):
        USER = True
        next_links = []
        body = Selector(text=response.body)
        images = body.css('img.photo-item__img').extract()
        for image in images:
            img_url = PexelsScraper.src_extractor.findall(image)[0]
            tags = [
                tag.replace(',', '').lower() for tag in
                PexelsScraper.tags_extractor.findall(image)[0].split(' ')
            ]
            print("Tags_check: ")
            print(tags)
        link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]  # Crawl the filtered links
        next_page_url = response.css(
            'div.pagination a[rel="next"]::attr(href)').extract_first()
        if next_page_url:
            next_page_url = URL + next_page_url
            next_links.append(next_page_url)
        print("next_page_url")
        print(next_page_url)
        if USER:
            links = response.css("a.pull-left::attr(href)").extract_first()
            print(links)
            if links:
                links = "https://www.pexels.com" + links
                for i in range(10):
                    next_links.append(links + "?page=" + str(i))
                print("go into user parse")
                #request.meta['main_url'] = URL
                #yield request
                for each in next_links:
                    yield scrapy.Request(each, self.parse_by_user)
                print("should have done user parse")
                print("Links_check: {}".format(links))

        for link in next_links:
            print("next_links")
            print(link)
            yield scrapy.Request(link, self.parse)
Example #17
 def parse(self, response):
     for info in response.css('.product_pod'):
         item = BooksItem()
         # print(info)
         item['name'] = info.css('h3>a::attr(title)').extract_first()
         # name = info.xpath('./h3/a/@title').extract_first()
         # print(name)
         item['price'] = info.css(
             '.product_price .price_color::text').extract_first()
         # price = info.xpath('//p[@class="price_color"]/text()').extract()
         # print(price)
         yield item
         bookstr = item['name'] + '\t' + item['price'] + '\n'
         self.f.write(bookstr)
     le = LinkExtractor(restrict_css='ul.pager li.next')
     links = le.extract_links(response)
     if links:
         next_url = links[0].url
         yield scrapy.Request(next_url, callback=self.parse)
     # next_url = response.css('.pager .next>a::attr(href)').extract_first()
Example #18
    def parse(self, response):
        print(response)
        body = Selector(text=response.body)
        images = body.css('img.image-section__image').extract()
        print(images)

        # body.css().extract() returns a list which might be empty
        for image in images:
            img_url = Scraper.src_extractor.findall(image)[0]
            print(img_url)

        link_extractor = LinkExtractor(allow=Scraper.url_matcher)
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]

        # Crawl the filtered links
        for link in next_links:
            yield scrapy.Request(link, self.parse)
Example #19
 def parse(self, response):
     link = LinkExtractor(restrict_xpaths=self.restrict_xpaths)
     links = link.extract_links(response)
     for i in links:
         self.item['link'] = i.url
         self.item['text'] = i.text
         self.item['date'] = datetime.datetime.now().strftime(
             '%Y-%m-%d %H:%M:%S')
         self.item['status'] = 0
         _rule = self.rule
         print(self.rule)
         _rule['start_urls'] = [i.url]
         _rule['name'] = self.next_name
         _rule['step'] = self.next_step
         self.item['rule'] = _rule
         print(_rule)
         backend = RedisBackend(REDIS_CONF)
         backend.send('%s_%s' % (REDIS_KEY, self.next_name), str(self.item))
         # print self.item
         yield self.item
Example #20
    def parse(self, response):
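        # Record links to document files (.pdf/.csv/.xls/.docx ...) and keep crawling the HTML pages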
        def get_links(obj):
            if obj:
                for link in obj:
                    if link not in data:
                        data.append({
                            "referer":
                            response.request.headers.get('Referer', None),
                            "url":
                            link
                        })
            return

        if "text" not in str(response.headers["Content-Type"]):
            get_links([response.url])
        else:
            body = Selector(text=response.body)

            pdfs = body.css('a[href$=".pdf"]::attr(href)').extract()
            csvs = body.css('a[href$=".csv"]::attr(href)').extract()
            xl1 = body.css('a[href$=".xls"]::attr(href)').extract()
            xl2 = body.css('a[href$=".xlsx"]::attr(href)').extract()
            doc1 = body.css('a[href$=".doc"]::attr(href)').extract()
            doc2 = body.css('a[href$=".docx"]::attr(href)').extract()
            link_objs = [pdfs, csvs, xl1, xl2, doc1, doc2]

            link_extractor = LinkExtractor(allow=self.allowed_domains)
            next_links = [
                link.url for link in link_extractor.extract_links(response)
                if not self.is_extracted(link.url)
            ]

            # Crawl the filtered links
            for link in next_links:
                yield scrapy.Request(link, self.parse)

            for obj in link_objs:
                get_links(obj)
Example #21
    def parse(self, response):
        link = LinkExtractor(restrict_xpaths=self.restrict_xpaths)
        links = link.extract_links(response)
        for i in links:
            self.item['link'] = i.url
            self.item['text'] = i.text
            self.item['date'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            self.item['status'] = 0
            _rule = self.rule
            _rule['start_urls'] = [i.url]
            _rule['name'] = self.next_name
            _rule['step'] = self.next_step
            self.item['rule'] = _rule
            print(self.rule)

            self.item['appname'] = response.xpath(
                self.rule['app_name']).extract()[0]
            downloadnumber = response.xpath(
                self.rule['downloadnumber']).extract()[0]
            self.item['downloadnumber'] = re.findall(r'\d+', downloadnumber)[0]
            downloadapp(i.url, _rule, self.item)
            # print self.item
            yield self.item
Example #22
    def parse(self, response):
        body = Selector(text=response.body)
        images = body.css('img.image-section__image').extract()
        # images is lists included of all tags of a class named 'img.image-section__image'

        # body.css().extract returns a list which might be empty
        for image in images:
            img_url = PexelsScraper.src_extractor.findall(image)[0]
            tags = [
                tag.replace(',', '').lower() for tag in
                PexelsScraper.tags_extractor.findall(image)[0].split(' ')
            ]
            img_type = str(img_url.split('/')[-1].split('.')[1].split('?')[0])

            # Set the full_path of a image file
            img_fullname = Global.img_name + str(Global.img_count) + img_type
            img_fullpath = Global.img_path + img_fullname
            # Download the image
            img_response = requests.get(img_url)
            with open(img_fullpath + '.' + img_type, 'wb') as f:
                f.write(img_response.content)
            del img_response

            # Print the result to the console
            print(img_fullname, img_url, tags)
            Global.img_count = Global.img_count + 1

        link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]

        # Crawl the filtered links
        for link in next_links:
            yield scrapy.Request(link, self.parse)
Example #23
File: ahthor.py Project: zybk01/ppython
    def parse(self, response):
        # try :
        self.crawled.append(response.url)
        path = response.meta['item']

        # except Exception as e:
        #     print(e)
        font = response.xpath('//font/text()[3]').extract()[0]
        title = response.xpath('/html/head/title/text()').extract()[0]
        title = re.findall(r'(.*?) -', title)[0]
        title = title.replace(':','-')
        title = title.replace(':', '-')
        title = title.strip()
        title = title.strip('\t')
        if self.signal == 'y':
            repath = path
        else:
            path = path + '/' + title
            repath = path
            self.signal = 'y'
        # repath = path
        localnumm = []
        localnumm.append(title)

        if font == '>文章内容':
            path = repath + '/' + title
            localnumm.append(path)
            isExists = os.path.exists(path)
            if not isExists:
                os.makedirs(path)
        link = LinkExtractor(restrict_xpaths='//*[@cellspacing="1"]//a')
        link = link.extract_links(response)
        number = 0
        # for sel in link:
        #     if sel.url in self.loaded:
        #         pass
        #     else:
        #         self.loaded.append(sel.url)
        #         number = number + 1
        localnumm.append(response.url)
        localnumm.append(number)

        for sel in link:
            if sel.url not in self.loaded:
                self.loaded.append(sel.url)
                number = number + 1
                localnumm[3] = number
                self.i = self.i + 1

                # print(sel.url)
                titem = transitem()
                titem['repath'] = repath
                titem['path'] = path
                titem['list'] = localnumm
                request = scrapy.Request(sel.url, callback=self.download_parse,
                                         headers={'User-Agent': 'Mozilla/5.0'},
                                         dont_filter=True)
                request.meta['item'] = titem
                yield request

        self.numm.append(localnumm)
Example #24
 def parse(self, response):
     link = LinkExtractor(restrict_xpaths='//ul[@class="list"]/li')
     links = link.extract_links(response)
     print(links)
     print(link)
Example #25
 def parse_index(self, response):
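     # allow takes a regular expression (or a list of them); only matching URLs are extracted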
     link = LinkExtractor(allow=r'2015/ssy/[a-z]{1,}(.htm)')
     links = link.extract_links(response)
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse_category)
Example #26
 def parse(self, response):
     link = LinkExtractor(allow=r'http://www.nncc626.com/[a-z]{1,}(.htm)',
                          deny=r'index')
     links = link.extract_links(response)
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse_index)
Example #27
 def parse(self, response):
     le = LinkExtractor(restrict_css='div.sphx-glr-thumbcontainer p.caption',
                        deny='/index.html$')
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, callback=self.page_parser)
Example #28
    def parse(self, response):

        # def get_proxy():
        #     return requests.get("http://127.0.0.1:5010/get/").content

        #
        # def delete_proxy(proxy):
        #     requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

        # Alternatively, a random proxy IP could be used
        # No need to set it here; doing it in the retry middleware is enough
        # Rotate the IPs: with, say, 500 usable IPs and 500 pages per minute, to the server each host appears to request only one page

        # while response.status == 403 or response.status == 302:
        #
        #     print(response.status)
        #
        #     print(response.meta)
        #
        #     # delete_proxy(response.headers)
        #
        #     # delete the proxy
        #
        #     # get a new proxy
        #
        #     proxy = get_proxy()
        #
        #     print("使用新代理:" + str(proxy))
        #
        #     #如果proxy_pool耗尽,暂时暂停爬虫或者更换目标网站,移动端或者wap,或者各大网站的cache
        #
        #     response = scrapy.Request(url=response.url, meta={'proxy':'http://' + str(proxy)})
        #
        #     print(type(response))

        # print("有respose")

        item = LearningItem()

        # Scrape the book title

        # Co-authors are put in one span just like translators; a single author sits in the sibling a node after the span whose text is "作者", so both cases need handling
        # Could an author have no link? No -- there is always at least a search link
        # A single author can also be wrapped in a pair of nested spans
        # Translator links are also author links; since we are scraping books that does not matter, and the main database has a translators field for translation research
        def is_exist(item_argv, xpath1, **xpath2):
            # item[item_argv] = info.xpath(xpath1).extract().strip()
            try:
                item[item_argv] = info.xpath(xpath1).extract()
            except:
                print(str(item_argv) + "出错")
                item[item_argv] = ''

            if len(item[item_argv]) == 1:

                item[item_argv] = item[item_argv][0].strip()

            # if len(item[item_argv]) == 0 and item[item_argv] != '':
            #
            #     item[item_argv] = ''

            # return item[item_argv][0].strip() if len(item[item_argv]) == 1 else item[item_argv]

            return item[item_argv]

        # try:
        # First, the ways Douban can fail:
        # - returns 403
        # - returns 200 but requires login
        # - returns an "application error" page
        # print("trying to crawl")

        # except:
        # print()
        # print("被ban!!!!!!!!!!!!!")
        #只会停止其中一个协程,其他要逐渐停止,强行ctrl + z 会导致后面的链接被添加到filter中,以后都不会再被爬取
        if response.status != 200:

            # Unsure whether pages missing the trailing '/' get redirected elsewhere with a 301, which would mean changing the next_page code
            # Checked in the shell: they do not -- the redirect comes straight back as a 200 response, the server fills in the trailing /
            raise CloseSpider('Forced stop!!!')
            # time.sleep(600)
            # raise CloseSpider()
            # return
            ## Put the ADSL re-dial or IP-switching logic here
            # print()
            # return

        print("此时的URL为:" + str(response.url))
        # writer_link_list = []
        # series_link_list = []
        try:
            info = response.xpath(u'//*[@id="info"]')[0]
        except:
            raise CloseSpider("出现200以外的错误,此时的url为 %s" % response.url)

            # Author and translator lists are both handled here

            # Check whether there are authors
            # Check whether there are translators
            # author-link texts above the translator label go into the author list
            # If there is no translator, all author-link texts are treated as authors
            # Easy to get wrong, e.g. if a "volunteer" credit shows up -- just an example

            # Author nodes: all preceding-sibling a nodes of the next sibling span after the author span; the author span comes first, so no other node interferes

            # First decide which of the two layouts this is

            # Just write all four layouts directly; an "a = b or c = d" one-liner style would also work

        # If another field were the anchor (e.g. a tags above the publisher are authors, those below translators), the spider would break whenever the publisher field is missing; anchoring on the label itself makes it more robust
        # With colon, no nesting
        w_name1 = info.xpath(
            u'//span[./text()="作者:"]/following-sibling::span[1]/preceding-sibling::a'
        )
        # With colon, nested
        w_name2 = info.xpath(u'//span[./text()="作者:"]/parent::span/a')
        # No colon, no nesting
        w_name3 = info.xpath(
            u'//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a'
        )
        # No colon, nested
        w_name4 = info.xpath(u'//span[./text()=" 作者"]/parent::span/a')

        if w_name1:
            item['writers'] = w_name1.xpath("./text()").extract()
            item['writers_link'] = w_name1.xpath("./@href").extract()

        elif w_name2:
            item['writers'] = w_name2.xpath("./text()").extract()
            item['writers_link'] = w_name2.xpath("./@href").extract()

        elif w_name3:
            item['writers'] = w_name3.xpath("./text()").extract()
            item['writers_link'] = w_name3.xpath("./@href").extract()

        elif w_name4:
            item['writers'] = w_name4.xpath("./text()").extract()
            item['writers_link'] = w_name4.xpath("./@href").extract()

        else:
            item['writers'] = ''
            item['writers_link'] = ''

#————————————————————————————————————————————————————————————————————————————————————————————————————————————————#

# Translators
# contains(@name,'na')

# With colon, no nesting
        t_name1 = info.xpath(
            u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"search")]'
        )
        # With colon, nested
        t_name2 = info.xpath(
            u'//span[./text()="译者:"]/following-sibling::a[contains(@href,"author")]'
        )
        # No colon, no nesting
        # Select the a nodes whose href contains a given substring
        # The links can be crawled directly, but the Chinese fields still need later cleaning and extraction

        # Broken
        # Still problematic: cannot replace and join the URL correctly
        # t_name3 = info.xpath(u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search") or contains(@href,"author")]')
        t_name3 = info.xpath(
            u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"search")]'
        )
        # No colon, nested
        t_name4 = info.xpath(
            u'//span[./text()=" 译者"]/following-sibling::a[contains(@href,"author")]'
        )

        if t_name4:
            item['translators'] = t_name4.xpath("./text()").extract()
            item['translators_link'] = t_name4.xpath("./@href").extract()

        elif t_name3:
            item['translators'] = t_name3.xpath("./text()").extract()
            item['translators_link'] = t_name3.xpath("./@href").extract()

        elif t_name2:
            item['translators'] = t_name2.xpath("./text()").extract()
            item['translators_link'] = t_name2.xpath("./@href").extract()

        elif t_name1:
            item['translators'] = t_name1.xpath("./text()").extract()
            item['translators_link'] = t_name1.xpath("./@href").extract()
        else:
            item['translators'] = ''
            item['translators_link'] = ''

#————————————————————————————————————————————————————————————————————————————————————————————————————————————————#

        item["publish"] = is_exist(
            "publish", u'//span[./text()="出版社:"]/following::text()[1]')

        item["publish_date"] = is_exist(
            "publish_date", u'//span[./text()="出版年:"]/following::text()[1]')
        item["pages"] = is_exist(
            "pages", u'//span[./text()="页数:"]/following::text()[1]')
        item["price"] = is_exist(
            "price", u'//span[./text()="定价:"]/following::text()[1]')
        item["binding"] = is_exist(
            "binding", u'//span[./text()="装帧:"]/following::text()[1]')
        item["ISBN"] = is_exist(
            "ISBN", u'//span[./text()="ISBN:"]/following::text()[1]')
        item["orgin_name"] = is_exist(
            "orgin_name", u'//span[./text()="原作名:"]/following::text()[1]')
        item["series"] = is_exist(
            "series", u'//span[./text()="丛书:"]/following::a[1]/text()')
        item["series_link"] = is_exist(
            "series_link",
            u'//span[./text()="丛书:"]/following-sibling::a[1]/@href')

        # item["summary"] = is_exist("summary",)
        # item["w_summary"] = is_exist("w_summary",)

        item["catalog"] = is_exist("catalog",
                                   '//*[contains(@id,"dir_")]/text()')
        item["tag"] = is_exist("tag",
                               '//*[@id="db-tags-section"]/div/span/a/text()')
        item["series_info"] = is_exist(
            "series_info",
            '//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()'
        )

        # item["readers"] = is_exist("readers",).extract().strip()

        # item["title"] = is_exist("title",).extract().strip()
        # item["url"] = is_exist("url",).extract().strip()
        # item["score"] = is_exist("score",).extract().strip()

        try:
            item['title'] = response.xpath(
                "//*[@id='wrapper']/h1/span/text()").extract_first()
        except:
            item['title'] = ''

        item['url'] = response.url.replace("https://book.douban.com/subject/",
                                           "").strip('/')

        try:
            item['score'] = response.css(
                '#interest_sectl > div > div.rating_self.clearfix > strong::text'
            ).extract_first().strip()
            if item['score'] == '':
                item['score'] = '0'
        except:
            item['score'] = '0'

        # try:
        #     item['publish'] = info.xpath().extract_first().strip()
        # except:
        #     item['publish'] = ''
        # try:
        #     item['publish_date'] = info.xpath(u'//span[./text()="出版年:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['publish_date'] = ''

        # try:
        #     item['pages'] = info.xpath(u'//span[./text()="页数:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['pages'] = ''

        # try:
        #     item['price'] = info.xpath(u'//span[./text()="定价:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['price'] = ''
        # try:
        #     item['binding'] = info.xpath(u'//span[./text()="装帧:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['binding'] = ''
        # try:
        #     item['ISBN'] = info.xpath(u'//span[./text()="ISBN:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['ISBN'] = ''
        # try:
        #     item['orgin_name'] = info.xpath(u'//span[./text()="原作名:"]/following::text()[1]').extract_first().strip()
        # except:
        #     item['orgin_name'] = ''
        # try:
        #     item['series'] = info.xpath(u'//span[./text()="丛书:"]/following::a[1]/text()').extract_first().strip()
        # except:
        #     item['series'] = ''
        # try:
        #     item['series_link'] = info.xpath(u'//span[./text()="丛书:"]/following-sibling::a[1]/@href').extract_first().strip()
        # except:
        #     item['series_link'] = ''

        # Two cases here: with and without a folded summary; try the folded one first, then fall back to the other

        try:

            summary = response.xpath(
                '//*[@id="link-report"]/span/div/div[@class="intro"]/p/text()')

            if summary:
                item['summary'] = summary.extract()
            else:
                item['summary'] = response.xpath(
                    '//*[@id="link-report"]/div[1]/div/p/text()').extract()

            # if len(item['summary']) == 0 and item['summary'] != '':
            #
            #     item['summary'] = ''

        except:

            item['summary'] = ''

        try:
            w_summary = response.css(
                '#content > div > div.article > div.related_info > div:nth-child(4) > span.all.hidden > div > p::text'
            )

            if w_summary:
                item['w_summary'] = w_summary.extract()
            else:
                item['w_summary'] = response.css(
                    '#content > div > div.article > div.related_info > div:nth-child(4) > span.short > div > p::text'
                ).extract()

            # if len(item['w_summary']) == 0 and item['w_summary'] != '':
            #
            #     item['w_summary'] = ''
        except:
            item['w_summary'] = ''

        # try:
        #     # broken
        #     # item['catalog'] = response.xpath('//*[contains(@id,"full") and contains(@id,"dir")]/text()').extract()
        #     item['catalog'] = response.xpath('//*[contains(@id,"dir_")]/text()').extract()
        # except:
        #     item['catalog'] = ''

        # try:

        #     item['tag'] = response.xpath('//*[@id="db-tags-section"]/div/span/a/text()').extract()
        # except:
        #     item['tag'] = ''

        # try:
        #     # the series info block is only present sometimes (randomly)
        #     item['series_info'] = response.xpath('//*[@id="content"]/div/div[1]/div[3]/div[@class="subject_show block5"]/div//text()').extract()
        # except:
        #     item['series_info'] = ''

        try:
            item['readers'] = response.css(
                '#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span::text'
            ).extract_first()

            if item['readers'] is None:
                item['readers'] = '0'
        except:
            item['readers'] = '0'

        # '//*[@id="link-report"]/div[1]/div/p'/div/div[@class="intro"]/p/text()

        # if w_name_mode1:
        #     # w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","")
        #     w_name = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()')

        #     # If the author name is captured, store it; otherwise it is the nested-span layout
        #     if w_name:
        #         item['writer'] = w_name.extract()

        #     else:
        #         item['writer'] = w_name_mode1.xpath('./following-sibling::span[1]/preceding-sibling::a/text()')

        #     /
        #     writer_name_type2 = links.xpath('//span[./text()=" 作者"]/following-sibling::span[1]/preceding-sibling::a/text()').extract_first().replace("\n","").replace(" ","")
        #     writer_name_type3 =
        #     # The single-author node is done; a whole group of author nodes still needs handling, see a university-textbook entry for an example
        #     # A group of author nodes works the same way as a group of translator nodes
        #     # Translator nodes: the span node right after the translator node

        #     # The multi-translator case is solved; for a single translator see the 傅雷 (Fu Lei) entry

        #     # link_extract = item.extract()
        #     if "author" in link:
        #         # print(item.xpath('./@href').extract())
        #         # This could be shortened
        #         writer_link_list.append(link)
        #     # Store the full URL so later crawls can skip the URL-joining step; faster, and the disk cost is small
        #     if "search" in link:
        #         link = "https://book.douban.com/" + link
        #         writer_link_list.append(link)

        #     if "series" in link:
        #         series_link_list.append(link)

        # item['writer_link'] = writer_link_list
        # item['series_link'] = series_link_list
        #         # item['writer'] = response.xpath(u'//span[./text()="作者:"]/following::a[2]')
        # # # // *[ @ id = "info"] / a[1]
        # # item['publish'] = response.xpath(u'//span[./text()="出版社:"]/following::text()[1]')
        # # item['orgin_name'] = response.xpath(u'//span[./text()="原作名:"]/following::text()[1]')

        # # This is only one of the cases; another needs its own try...except, plus Chinese books with no translator and full-width vs. half-width punctuation issues

        # c = ""#单个翻译者

        # try:
        #     if a:
        #         item['translator'] = a[0].xpath('./a/text()').extract()
        #     if b:
        #         item['translator'] = b[0].xpath('./a/text()').extract()
        # except:
        #     item['translator'] = ''

        # Number of valid ratings
        # if item['readers']:

        #     v = int(item['readers'])

        # else:
        #     v = 0

        # # Minimum number of raters needed to enter the top 250
        # m = 10000

        # # Book score
        # if item['score']:
        #     R = float(item['score'])
        # else:
        #     R = 0

        # # C is the average score of all books in the database; a rough value is good enough
        # C = 7

        item["weighting"] = 0
        item['seen'] = 0

        yield item

        # item['p_date']
        # item['total_pages']
        # item['price']
        # item['binding']
        # item['series']
        # item['ISBN']
        # item['summary']
        # item['w_introduce']
        # item['ca']
        # item['tag']
        # item['s_info']
        # item['score']
        # item['readers']
        # print(item['title'])
        # all = response.xpath("string(//*[@id='info'])")
        # all =
        # print(all.extract())
        # print(all.extract()[0].replace("\n",""))
        # print(all.extract()[0].replace("\n","").replace(" ",""))
        # print(type(all.extract()))
        # yield item
        # The id is generally stable, so CSS changes can be ignored
        # Skip cleaning for now in exchange for faster crawling
        # all = response.xpath('//*[@id="info"]')
        # all = all.extract()[0].replace("\n","").replace("\t","").split("<br>")
        # for item in all:
        # print(item.replace('<spanclass="pl">',"").replace("</span>","").replace("""<divid="info"class="">""","").replace("</div>","").replace("</a>","").replace("""<aclass=""href=""","").replace("<span>","").replace("<ahref=",""))
        # all = response.xpath(u'//span[./text()=" 作者"]/following::text()')
        # print(all)

        # Write to MySQL in batches, not on every item

        #
        # Extract the "readers who like this book also like" links
        link = LinkExtractor(
            restrict_xpaths=('//*[@id="db-rec-section"]/div//dl//dd'))
        links = link.extract_links(response)

        # If the link is directly related, response.follow can be used instead; it returns a request for the url, which can then be yielded:
        # links = response.xpath('//*[@id="db-rec-section"]/div//dl//dd').extract()

        # for link in links:
        #     yield response.follow(link,callback=self.parse)

        for link in links:
            # print("弹出一个url")

            # if link.url.endswith('/'):
            # pass
            # else:
            # link.url = link.url + "/"
            # Without a trailing "/" the URL gets redirected; unnecessary, but it might be used to fingerprint crawlers
            yield scrapy.Request(url=link.url, callback=self.parse)
Example #29
 def parse(self, response):
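     # Follow every toctree entry except index pages, which the deny pattern filters out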
     link = LinkExtractor(restrict_css='div.toctree-wrapper.compound',
                          deny='index.html$')
     for link in link.extract_links(response):
         yield scrapy.Request(url=link.url, callback=self.file_parse)
Example #30
 def mvlink(self, response):
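     # Extract movie detail links of the form /movie/<id> and fetch each one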
     links_rule = LinkExtractor(allow=r'/movie/\d+')
     links = links_rule.extract_links(response)
     for i in links:
         sleep(1)
         yield Request(i.url, callback=self.neirong)