class Dgtle(RedisCrawlSpider):
    # Spider name
    name = "dgtle"
    # Allowed domains: the spider may only crawl within these
    allowed_domains = ["dgtle.com"]
    # Redis queue key holding the start URLs (the first batch of requests)
    redis_key = 'dgtle:start_urls'
    # start_urls = ['http://www.dgtle.com/']

    rules = (
        # Follow category listing links such as '/portal.php?mod=list&catid=NN'
        Rule(LxmlLinkExtractor(allow=(r'/portal\.php\?mod=list&catid=\d{2}', )),
             follow=True),
        # Parse article pages such as '/article-123-1.html' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/article[\d-]+\.html', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/div[3]/h2/a/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/div[3]/div/div[1]/i/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '/html/body/div[4]/div[1]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class WangyiNewsSpider(CrawlSpider):
    name = '163_news'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/']

    article_extract = LxmlLinkExtractor(
        allow=(r'/\d{2}/\d{4}/\d{2}/[a-zA-Z0-9_]+\.html',
               r'photoview/[a-zA-Z0-9]+/\d+\.html',
               r'/\d+/\d+/[A-Z0-9]+\.html',
               r'/photo/[A-Z0-9]+/\d+\.html',
               r'/\d+/\d/[a-zA-Z0-9_]+\.html'),
        allow_domains=('news.163.com', ))
    follow_extract = LxmlLinkExtractor(allow_domains=('news.163.com', ))

    rules = (Rule(article_extract, follow=True, callback='parse_article'),
             Rule(follow_extract, follow=True, callback='parse_follow'))

    a_count = 0
    f_count = 0

    def parse_article(self, response):
        self.a_count += 1
        print('article: ' + str(self.a_count) + ' ' + response.url)
        sel = Selector(response)
        # Example article: http://news.163.com/17/0117/14/CB07N4J4000187VE.html
        news_1_div = sel.xpath(
            '//div[@id="epContentLeft"]/div[@id="post_body"]')

    def parse_follow(self, response):
        self.f_count += 1
        print('follow: ' + str(self.f_count) + ' ' + response.url)
class InfoQ(RedisCrawlSpider):
    # Spider name
    name = "infoq"
    # Allowed domains: the spider may only crawl within these
    allowed_domains = ["infoq.com"]
    # Redis queue key holding the start URLs (the first batch of requests)
    redis_key = 'infoq:start_urls'
    # start_urls = ['http://www.infoq.com/cn/']

    rules = (
        # Follow section pages such as 'infoq.com/cn/<section>/...'
        Rule(LxmlLinkExtractor(allow=(r'infoq\.com/cn/[a-z]+/.+', )),
             follow=True),
        # Parse news pages such as 'infoq.com/cn/news/YYYY/MM/...' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'infoq\.com/cn/news/\d{4}/\d{2}/.+', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="content"]/h1/text()').extract()[0].strip()
        # Build 'YYYY-MM' from the URL path segments
        item['pub_time'] = response.url.split(
            '/')[-3] + '-' + response.url.split('/')[-2]
        item['content_code'] = response.xpath(
            '//*[@id="content"]/div[2]/div[1]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Zaker(RedisCrawlSpider):
    # Spider name
    name = "zaker"
    # Allowed domains
    allowed_domains = ["myzaker.com"]
    # Redis queue key holding the start URLs
    redis_key = 'zaker:start_urls'
    # start_urls = ['https://www.myzaker.com/']

    rules = (
        # Follow the selected channels; the original '[13|5|4|1039]' was a
        # character class, not the intended alternation, so use a group
        Rule(LxmlLinkExtractor(allow=(r'/channel/(13|5|4|1039)', )),
             follow=True),
        # Parse article pages such as '/article/<id>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/article/.+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//div[@id="content"]/div/div/div/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//div[@id="article"]/div[1]/div/a/span[3]/text()').extract(
            )[0].strip()
        item['content_code'] = response.xpath(
            '//div[@class="article_content"]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Guokr(RedisCrawlSpider):
    # Spider name
    name = "guokr"
    # Allowed domains
    allowed_domains = ["guokr.com"]
    # Redis queue key holding the start URLs
    redis_key = 'guokr:start_urls'
    # start_urls = ['http://www.guokr.com/scientific/']

    rules = (
        # Follow channel pages such as '/scientific/channel/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/scientific/channel/[a-z]+/', )),
             follow=True),
        # Parse article pages such as '/article/<id>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/article/\d+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="articleTitle"]/text()').extract()[0].strip()
        # NOTE: the publication time is hard-coded rather than extracted
        item['pub_time'] = '2017-08-27'
        item['content_code'] = response.xpath(
            '//*[@id="articleContent"]/div/div[1]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Techreview(RedisCrawlSpider):
    # Spider name
    name = "techreview"
    # Allowed domains
    allowed_domains = ["technologyreview.com"]
    # Redis queue key holding the start URLs
    redis_key = 'techreview:start_urls'
    # start_urls = ['https://www.technologyreview.com/']

    rules = (
        # Follow topic pages such as '/topic/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/topic/.+/', )), follow=True),
        # Parse article pages such as '/s/NNNNNN/<slug>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/s/\d{6}/[a-z0-9-]+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/main/section/article/div/div[1]/div[2]/div/div[1]/h1/text()'
        ).extract()[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/main/section/article/div/div[1]/div[2]/div/div[2]/ul/li[2]/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '/html/body/main/section/article/div/div[2]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class mobilephoneSpider(CrawlSpider):
    name = "mobilephone"
    allowed_domains = ["zol.com.cn"]
    start_urls = [
        'http://detail.zol.com.cn/cell_phone_index/subcate57_list_1.html'
    ]

    rules = (
        Rule(
            LxmlLinkExtractor(
                # only allow these parameter pages
                allow=(r'detail\.zol\.com\.cn/\d*?/\d*?/param\.shtml', ),
                deny=(),
            ),
            # whether links should be followed from each response extracted
            # with this rule
            follow=False,
            process_links=lambda links: [link for link in links
                                         if not link.nofollow],
            # NOTE: CrawlSpider uses 'parse' internally, so a custom callback
            # name would be safer here
            callback='parse'),
        Rule(
            LxmlLinkExtractor(
                allow=(r'detail\.zol\.com\.cn/cell_phone/index\d*?\.shtml', ),
                deny=(),
            ),
            follow=False,
            process_links=lambda links: [link for link in links
                                         if not link.nofollow],
            callback='parse_price'),
        Rule(
            LxmlLinkExtractor(
                # this format is wrong; compare
                # http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_2_0_1.html
                allow=(r'/cell_phone_index/subcate57_\d*?_list_1[_\d]*?\.html', ),
                deny=('digital', 'notebook', 'tablepc', 'gps',
                      'keyboards_mouse', 'desktop_pc', 'gpswatch', 'zsyxj',
                      'motherboard', 'vga', 'cpu', 'hard_drives', 'menmery',
                      'case', 'power', 'cooling_product', 'solid_state_drive',
                      'dvdrw', 'sound_card', 'diy_host', 'usb-hub', 'speaker',
                      'mb_chip'),
            ),
            follow=True,
            process_links=lambda links: [link for link in links
                                         if not link.nofollow],
        )
    )
class Tmtpost(RedisCrawlSpider):
    # Spider name
    name = "tmtpost"
    # Allowed domains
    allowed_domains = ["tmtpost.com"]
    # Redis queue key holding the start URLs
    redis_key = "tmtpost:start_urls"
    # start_urls = ['http://www.tmtpost.com/']

    rules = (
        # Follow column pages such as '/column/<id>'
        Rule(LxmlLinkExtractor(allow=(r'/column/\d+', )), follow=True),
        # Parse article pages such as 'tmtpost.com/<id>.html' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'tmtpost\.com/\d+\.html', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath('//h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//span[@class="time"]/text()').extract()[0][:10]
        item['content_code'] = response.xpath('//article/div[2]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class FTchinese(RedisCrawlSpider):
    # Spider name
    name = "ftchinese"
    # Allowed domains
    allowed_domains = ["ftchinese.com"]
    # Redis queue key holding the start URLs
    redis_key = 'ftchinese:start_urls'
    # start_urls = ['http://www.ftchinese.com/']

    rules = (
        # Follow channel pages such as '/channel/<name>.html'
        Rule(LxmlLinkExtractor(allow=(r'/channel/.+\.html', )), follow=True),
        # Parse story pages such as '/story/<id>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/story/.+', )), callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/div[5]/div/div[1]/div/div[1]/h1/text()').extract(
            )[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/div[5]/div/div[1]/div/div[1]/div[5]/span[1]/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '/html/body/div[5]/div/div[1]/div/div[1]/div[6]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Centerwatch(CrawlSpider):
    name = 'centerwatch'
    allowed_domains = ['centerwatch.com']
    start_urls = [
        "http://www.centerwatch.com/drug-information/fda-approved-drugs/therapeutic-areas"
    ]

    rules = (
        Rule(LxmlLinkExtractor(
            restrict_xpaths=('.//li/a[contains(@id, "ctl00")]'))),
        Rule(LxmlLinkExtractor(
            restrict_xpaths=('//div[@id="ctl00_BodyContent_AreaDetails"]')),
            callback='parse_drug'),
    )

    def parse_drug(self, response):
        page = response.xpath('//div[@class="row"]')[3]
        summary_cols = page.xpath('.//div[@id="SummaryColumn"]/div/div/p')
        drug = Drug(
            name=page.xpath('.//h1/text()').extract_first(),
            company=summary_cols[1].xpath('./a/text()').extract_first(),
            approval_status=summary_cols[3].xpath('./text()').extract_first(),
            specific_treatment=summary_cols[5].xpath(
                './text()').extract_first(),
            therapeutic_areas=summary_cols[7].xpath('./a/text()').extract())
        yield drug
class MoiveSpider(CrawlSpider):
    """Crawl the Douban movie Top 250 list and its linked subject pages."""
    name = 'moive'
    allowed_domains = ['movie.douban.com']
    start_urls = ['http://movie.douban.com/top250']

    rules = [
        # Pagination links look like '?start=25&filter=' (no spaces around '=')
        Rule(LxmlLinkExtractor(allow=(r'\?start=\d+.*'))),
        Rule(LxmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        sel = Selector(response)
        item = MoviceItem()
        # '//*div[...]' is not valid XPath; '//div[...]' is used instead
        item['name'] = sel.xpath(
            "//div[@id='content']/h1/span[1]/text()").extract()
        item['year'] = sel.xpath("//*[@id='content']/h1/span[2]/text()").re(
            r'\((\d+)\)')
        item['score'] = sel.xpath(
            "//div[@class='clearfix']/strong/text()").extract()
        item['director'] = sel.xpath(
            "//div[@id='info']/span[1]/a/text()").extract()
        item['classification'] = sel.xpath(
            "//span[@property='v:genre']/text()").extract()
        item['actor'] = sel.xpath(
            "//span[@class='actor']//a/text()").extract()
        return item
def parse_data(self, response):
    item = CrawlerItem()
    # Extract and normalize the page title
    title = response.css("head title::text").extract_first().strip()
    if title.endswith(' | University of Illinois at Chicago'):
        title = title[:-36]
    soup = BeautifulSoup(response.text, "html.parser")
    for div in soup.find_all("div", {'class': 'browser-stripe'}):
        div.decompose()
    # Extract the visible page content
    contents = soup.findAll(text=True)
    visible_texts = filter(tag_visible, contents)
    item['content'] = " ".join(t.strip() for t in visible_texts)
    # Collect outgoing links within uic.edu, skipping the login host
    outlinks = []
    le = LxmlLinkExtractor(allow_domains=('uic.edu'),
                           deny_domains=('login.uic.edu'),
                           unique=True,
                           canonicalize=True)
    for link in le.extract_links(response):
        outlinks.append(link.url)
    if title != 'UIC Directory' and title != 'Search Help' \
            and 'uic.edu' in response.request.url:
        item['title'] = title
        item['url'] = response.request.url
        item['outlinks'] = outlinks
        yield item
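# tag_visible is referenced by parse_data above but not defined in this
# snippet. A minimal sketch of the usual BeautifulSoup visibility filter it
# appears to assume (hypothetical, not the author's confirmed version):
from bs4.element import Comment

def tag_visible(element):
    # Skip text nodes inside non-rendered tags and HTML comments
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta',
                               '[document]'):
        return False
    if isinstance(element, Comment):
        return False
    return True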
class NewsFeedSpider(CrawlSpider):
    name = 'urls'
    # 'domains' and 'URLS' are assumed to be defined at module level
    allowed_domains = [domain.strip() for domain in domains]
    start_urls = URLS
    content_types = ['text/xml', 'application/xml', 'rss', 'xml']

    rules = (
        Rule(LxmlLinkExtractor(
            allow=(r'.*\.xml$', r'.*\.atom$', r'.*\.rss$', r'.*\.feed$',
                   r'.*\.feeds$'), ),
             callback='parse_item'),
        Rule(LxmlLinkExtractor(
            allow=(r'.*xml.*', r'.*rss.*', r'.*feed.*', r'.*feeds.*'), ),
             callback='parse_item',
             follow=True),
        Rule(LxmlLinkExtractor(allow=(r'.*', ), )),
    )

    def parse_item(self, response):
        # Headers are bytes; decode before substring-matching content types
        cts = response.headers.get('Content-Type', b'').decode('utf-8',
                                                               'ignore')
        for ct in self.content_types:
            if ct in cts:
                print(response.url)
                url = FeedUrl()
                url['url'] = response.url
                yield url
class technode(RedisCrawlSpider):
    # Spider name
    name = "technode"
    # Allowed domains
    allowed_domains = ["technode.com"]
    # Redis queue key holding the start URLs
    redis_key = "technode:start_urls"
    # start_urls = ['http://cn.technode.com/']

    rules = (
        # Follow category pages such as '/post/category/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/post/category/[a-z-]+/', )),
             follow=True),
        # Parse post pages such as '/post/YYYY-MM-DD/<slug>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/post/[\d-]{10}/.+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*/header/h1/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*/header/div/time/text()').extract()[0]
        item['content_code'] = response.xpath(
            '//*[@id="inner-wrap"]/div[5]/div/div/div/div/div/div[1]/article'
        ).extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class LinkSpider(CrawlSpider):
    """
    TODO find a way to keep referer and title of broken href
    """
    name = 'link_spider'
    allow_domains = []
    start_urls = []
    handle_httpstatus_list = [404, 500, 403, 401, 400]
    broken_links = set()

    rules = (
        Rule(LxmlLinkExtractor(allow=(), allow_domains=['qrpay.ai']),
             callback='parse_obj',
             follow=True),
        Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=False),
    )

    def parse_obj(self, response):
        # response.status is an int, so compare against ints only
        if response.status not in (200, 301, 302):
            # start_domain and redis_store are assumed to be defined elsewhere
            print(self.start_domain)
            item = BrokenItem()
            item['url'] = response.url
            item['referer'] = response.request.headers.get(
                'Referer', b'').decode('utf-8')
            item['status'] = response.status
            # TODO put into pipeline
            link = json.dumps(dict(item))
            if link not in self.broken_links:
                self.broken_links.add(link)
                redis_store.lpush('broken_links_%s' % (self.start_domain),
                                  link)
            return item
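# redis_store (and start_domain) are used by LinkSpider above but not defined
# in this snippet. A minimal sketch of the assumed module-level setup, with
# placeholder connection values:
from redis import StrictRedis

redis_store = StrictRedis(host='localhost', port=6379, db=0)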
class Techqq(RedisCrawlSpider):
    # Spider name
    name = "techqq"
    # Allowed domains
    allowed_domains = ["tech.qq.com"]
    # Redis queue key holding the start URLs
    redis_key = "techqq:start_urls"
    # start_urls = ['http://tech.qq.com/']

    rules = (
        # Follow listing pages such as '/web/<name>.htm' and '/<name>.html'
        Rule(LxmlLinkExtractor(allow=(r'/web/[a-z]+\.htm', r'/[a-z]+\.html')),
             follow=True),
        # Parse article pages such as '/a/YYYYMMDD/<id>.htm' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/a/\d{8}/\d+\.htm', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/h1/text()'
        ).extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="Main-Article-QQ"]/div/div[1]/div[1]/div[1]/div/div[1]/span[3]/text()'
        ).extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="Cnt-Main-Article-QQ"]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Aitists(RedisCrawlSpider):
    # Spider name
    name = "aitists"
    # Allowed domains
    allowed_domains = ["aitists.com", "mp.weixin.qq.com"]
    # Redis queue key holding the start URLs
    redis_key = 'aitists:start_urls'
    # start_urls = ['http://www.aitists.com/']

    rules = (
        # Follow category pages such as '/category/<name>'
        Rule(LxmlLinkExtractor(allow=(r'/category/.+', )), follow=True),
        # Parse WeChat article pages such as 'mp.weixin.qq.com/s/<id>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'com/s/.+', )), callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="activity-name"]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="post-date"]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="js_content"]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Economist(RedisCrawlSpider):
    # Spider name
    name = "economist"
    # Allowed domains
    allowed_domains = ["economist.com"]
    # Redis queue key holding the start URLs
    redis_key = 'economist:start_urls'
    # start_urls = ['https://www.economist.com/']

    rules = (
        # Follow section pages such as '/sections/<name>'
        Rule(LxmlLinkExtractor(allow=(r'/sections/.+', )), follow=True),
        # Parse article pages such as '/news/<section>/<slug>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/news/.+/.+', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//h1/span[2]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//time[1]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//main/div/div[1]/div/article/div[1]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Donews(RedisCrawlSpider):
    # Spider name
    name = "donews"
    # Allowed domains
    allowed_domains = ["donews.com"]
    # Redis queue key holding the start URLs
    redis_key = 'donews:start_urls'
    # start_urls = ['http://www.donews.com/', 'http://www.donews.com/idonews/']

    rules = (
        # Follow section index pages such as 'donews.com/<section>/index'
        Rule(LxmlLinkExtractor(allow=(r'donews\.com/[a-z]+/index', )),
             follow=True),
        # Parse article detail pages with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'donews\.com/news/detail/\d/\d+\.html',
                                      r'/article/detail/\d+/\d+\.html')),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="main"]/div[2]/h2/text()').extract()[0]
        item['pub_time'] = response.xpath(
            '//*[@id="main"]/div[2]/div[1]/p/span[2]/text()').extract()[0]
        item['content_code'] = response.xpath(
            '//*[@id="main"]/div[2]/div[2]').extract()[0]
        # Yield each item to the pipelines
        yield item
def parse(self, response):
    links = LxmlLinkExtractor(allow=()).extract_links(response)
    links = [str(link.url) for link in links]
    links.append(str(response.url))
    for link in links:
        yield scrapy.Request(url=link, callback=self.parse_link)
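# parse_link is referenced above but not shown. A hypothetical minimal stub,
# assuming the spider only needs to record each visited URL:
def parse_link(self, response):
    # Replace with real per-page extraction logic
    self.logger.info('visited %s', response.url)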
class Githubtrendingrepocrawler(CrawlSpider):  # 1
    name = 'GithubTrendingRepoCrawler'  # 2
    start_urls = ['http://github.com/trending/']  # 2

    # 3
    rules = (
        # Extract links from this path only; 'repo-list' must be quoted inside
        # the XPath, restrict_xpaths should select elements rather than @href,
        # and allow_domains takes bare domains, not URLs
        Rule(LxmlLinkExtractor(
            restrict_xpaths=['//ol[@id="repo-list"]//h3/a'],
            allow_domains=['github.com']),
            callback='parse'),
        # Links should match this pattern and create new requests
        Rule(LxmlLinkExtractor(allow=r'https://github.com/[\w-]+/[\w-]+$',
                               allow_domains=['github.com']),
             callback='parse_product_page'),
        # # Recursive Rule
        # Rule(
        #     LxmlLinkExtractor(allow=r'https://github.com/[\w-]+/[\w-]+$',
        #                       allow_domains=['github.com']),
        #     callback='parse_product_page', follow=True
        # ),
    )

    # 4
    def parse_product_page(self, response):
        item = PageContentItem()
        item['url'] = response.url
        item['content'] = response.css('article').get()
        yield item
def parsePreDesigne(self, response):
    # Sections before the "double-storey" marker hold single-storey designs;
    # sections after it hold double-storey designs
    singleLinks = LxmlLinkExtractor(
        allow=(r'/pre-designed-home-range/[\w-]+/$', ),
        restrict_xpaths='//div[@id="double-storey"]/preceding-sibling::section'
    ).extract_links(response)
    doubleLinks = LxmlLinkExtractor(
        allow=(r'/pre-designed-home-range/[\w-]+/$', ),
        restrict_xpaths='//div[@id="double-storey"]/following-sibling::section'
    ).extract_links(response)
    for link in singleLinks:
        meta = {'storey': 1}
        yield Request(link.url, callback=self.parseItem, dont_filter=True,
                      meta=meta)
    for link in doubleLinks:
        meta = {'storey': 0}
        yield Request(link.url, callback=self.parseItem, dont_filter=True,
                      meta=meta)
class icsspider(CrawlSpider):
    download_delay = 3
    retry_times = 10
    name = 'ics'
    start_urls = ['http://ics.cnvd.org.cn/?max=20&offset=400']
    # start_urls = ['http://www.cnvd.org.cn/flaw/list.htm?max=20&offset=3240']
    allowed_domains = ['ics.cnvd.org.cn', 'www.cnvd.org.cn']

    rules = (
        Rule(LxmlLinkExtractor(allow=(r'/\?max=\d+', ),
                               restrict_xpaths=("//a[@class='nextLink']", ))),
        Rule(LxmlLinkExtractor(allow=('/flaw/show/', ),
                               restrict_xpaths=("//tbody[@id='tr']", )),
             follow=True,
             callback='parse_item'))

    def parse_item(self, response):
        sel = Selector(response)
        try:
            item = ics()
            item['cnvd'] = ''.join(sel.xpath(
                "//table[@class='gg_detail']/tbody/tr[1]/td[2]/text()"
            ).extract()).strip()
            yield item
        except Exception:
            # 'except exceptions:' referenced an undefined name; catch Exception
            print('url: %s failed to parse' % response.url)
class Finance_ifeng(RedisCrawlSpider):
    # Spider name
    name = "finance_ifeng"
    # Allowed domains
    allowed_domains = ["ifeng.com"]
    # Redis queue key holding the start URLs
    redis_key = 'finance_ifeng:start_urls'
    # start_urls = ['http://finance.ifeng.com/',
    #               'http://tech.ifeng.com/',
    #               'http://finance.ifeng.com/stock/gstzgc/']

    rules = (
        # Follow channel pages such as 'ifeng.com/<channel>/'
        Rule(LxmlLinkExtractor(allow=(r'ifeng\.com/[a-z]+/', )), follow=True),
        # Parse article pages such as 'ifeng.com/a/YYYYMMDD/<id>.shtml' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'ifeng\.com/a/\d{8}/[\d_]+\.shtml', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="artical_topic"]/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="artical_sth"]/p/span[1]/text()').extract()[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="main_content"]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Cyzone(RedisCrawlSpider):
    # Spider name
    name = "cyzone"
    # Allowed domains
    allowed_domains = ["cyzone.cn"]
    # Redis queue key holding the start URLs
    redis_key = 'cyzone:start_urls'
    # start_urls = ['http://www.cyzone.cn/']

    rules = (
        # Follow category pages such as '/category/<id>/'
        Rule(LxmlLinkExtractor(allow=(r'/category/\d+/', )), follow=True),
        # Parse article pages such as '/a/YYYYMMDD/<id>.html' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/a/\d{8}/\d+\.html', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="article"]/div[1]/div[1]/h1/text()').extract()[0].strip()
        # The date segment of the URL, e.g. 'YYYYMMDD'
        item['pub_time'] = response.url.split("/")[-2]
        item['content_code'] = response.xpath(
            '//*[@id="article"]/div[1]/div[2]').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class NewsFeedSpider(CrawlSpider):
    name = 'sites'
    # 'domains' is assumed to be defined at module level
    allowed_domains = [domain.strip() for domain in domains]
    start_urls = ['http://www.nytimes.com/services/xml/rss/index.html']  # URLS

    rules = (
        # '.*xml.*', '.*rss.*', '.*feed.*', '.*feeds.*'
        Rule(LxmlLinkExtractor(
            allow=(r'.*\.xml$', r'.*\.atom$', r'.*\.rss$', r'.*\.feed$',
                   r'.*\.feeds$'), ),
             callback='parse_item'),
        Rule(LxmlLinkExtractor(allow=(r'.*', ), )),
    )

    def parse_item(self, response):
        page = feedparser.parse(response.body)
        item = FeedUrl()
        item['url'] = response.url
        print(response.url)
        return item
class Dsj36(RedisCrawlSpider):
    # Spider name
    name = "36dsj"
    # Allowed domains
    allowed_domains = ["36dsj.com"]
    # Redis queue key holding the start URLs
    redis_key = '36dsj:start_urls'
    # start_urls = ['http://www.36dsj.com/']

    rules = (
        # Follow category pages such as '/archives/category/<name>'
        Rule(LxmlLinkExtractor(allow=(r'/archives/category/[a-z-]+', )),
             follow=True),
        # Parse article pages such as '/archives/<id>' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/archives/\d+', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/body/section/div/div/header/h1/a/text()').extract(
            )[0].strip()
        item['pub_time'] = response.xpath(
            '/html/body/section/div/div/header/ul/li[2]/text()').extract()[0]
        item['content_code'] = response.xpath(
            '/html/body/section/div/div/article').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class KejijieChannelsSpider(CrawlSpider):
    start_urls = ["http://www.kejilie.com/channelsubscribe.html"]
    name = 'kejilieChannels'
    allowed_domains = ['www.kejilie.com']

    Redis2Info = config.info["Redis2Info"]
    redis_db = StrictRedis(host=Redis2Info['host'],
                           port=Redis2Info['port'],
                           password=Redis2Info['pwd'],
                           db=Redis2Info['db'])

    rules = (
        Rule(LxmlLinkExtractor(
            allow=('http://www.kejilie.com/channeltype/.*', )),
            follow=True),
        Rule(LxmlLinkExtractor(
            allow=('http://www.kejilie.com/channel/.*', ),
            deny=("http://www.kejilie.com/channel/.*/feed", )),
            callback='parseChannel'))

    def parseChannel(self, response):
        info("-----------------kejiliechannels url:" + response.url)
        title = response.xpath(
            "//div[@class='am_news_list_all']//div[@class='am-titlebar am-titlebar-default mt-0']/h1/text()"
        ).extract_first()
        logo = response.xpath(
            "//div[@class='am_news_list_all']//div[@class='mt-10']/div[@class='am-fl']/img/@src"
        ).extract_first()
        self.redis_db.sadd(
            "kejiliechannels",
            json.dumps({
                'url': response.url,
                'title': title,
                'logo': logo
            }))
class Vcbeat(RedisCrawlSpider):
    # Spider name
    name = "vcbeat"
    # Allowed domains
    allowed_domains = ["vcbeat.net"]
    # Redis queue key holding the start URLs
    redis_key = "vcbeat:start_urls"
    # start_urls = ['http://vcbeat.net/', 'http://vcbeat.net/Series/seriesIndex']

    rules = (
        # Follow series pages such as '/seriesD/<id>'
        Rule(LxmlLinkExtractor(allow=(r'/seriesD/\d{1,2}', )), follow=True),
        # Parse article pages (URLs ending in '=') with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'vcbeat\.net/.+=', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="article_title"]/p/text()').extract()[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="article_title"]/div/span[2]/text()').extract()[0]
        item['content_code'] = response.xpath(
            '/html/body/div[7]/div/div[1]/div[1]').extract()[0]
        # Yield the item to the pipelines; control returns here afterwards
        yield item
class Syncedreview(RedisCrawlSpider):
    # Spider name
    name = "rsarxiv"
    # Allowed domains
    allowed_domains = ["rsarxiv.github.io"]
    # Redis queue key holding the start URLs
    redis_key = 'rsarxiv:start_urls'
    # start_urls = ['http://rsarxiv.github.io/']

    rules = (
        # Follow tag pages such as '/tags/<name>/'
        Rule(LxmlLinkExtractor(allow=(r'/tags/.+/', )), follow=True),
        # Parse post pages such as '/YYYY/MM/DD/<slug>/' with parse_item; do not recurse
        Rule(LxmlLinkExtractor(allow=(r'/\d{4}/\d{2}/\d{2}/.+/', )),
             callback='parse_item'),
    )

    def parse_item(self, response):
        item = NewsItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '//*[@id="wrapper"]/article/div/header/h1/text()').extract(
            )[0].strip()
        item['pub_time'] = response.xpath(
            '//*[@id="wrapper"]/article/div/header/time/a/text()').extract(
            )[0].strip()
        item['content_code'] = response.xpath(
            '//*[@id="wrapper"]/article/div/div').extract()[0].strip()
        # Yield the item to the pipelines; control returns here afterwards
        yield item
def get_links(url, body):
    # For page URLs like '.../index.html', use the containing directory as
    # the allow prefix so sibling pages match
    start_url = url
    if '.html' in start_url:
        start_url = start_url.rsplit('/', 1)[0]
    response = HtmlResponse(url=start_url, body=body, encoding='utf8')
    link_extractor = LxmlLinkExtractor(
        allow=[start_url],  # the URL string is used as a regex prefix
        deny=[],
        tags=('a', ),
        attrs=('href', ),
        canonicalize=True)
    return link_extractor.extract_links(response)
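# A quick usage sketch for get_links() (hypothetical URL and body). Because
# allow=[start_url] treats the trimmed URL as a regular expression, only
# links under the page's directory are returned:
html = b'<a href="http://example.com/news/a.html">a</a>'
for link in get_links('http://example.com/news/index.html', html):
    print(link.url)  # -> http://example.com/news/a.html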
class LinkProcedure(BaseProcedure):
    """Link extractor built on scrapy's LxmlLinkExtractor.

    Usage: link <xpath> [<css>]
        xpath  string|array  passed to LxmlLinkExtractor's restrict_xpaths
        css    string|array  passed to LxmlLinkExtractor's restrict_css
    """

    def __init__(self, *args):
        xpath = args[0]
        css = len(args) >= 2 and args[1] or None
        self._extractor = LxmlLinkExtractor(restrict_xpaths=xpath,
                                            restrict_css=css)

    def do(self, input_, **kwargs):
        if isinstance(input_, Response):
            links = self._extractor.extract_links(input_)
            return [i.url.strip() for i in links]
        else:
            raise Exception('link input error')
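# A usage sketch for LinkProcedure, assuming the BaseProcedure interface shown
# above (the response body and XPath are illustrative):
from scrapy.http import HtmlResponse

response = HtmlResponse(url='http://example.com/',
                        body=b'<div id="list"><a href="/a.html">a</a></div>',
                        encoding='utf8')
proc = LinkProcedure('//div[@id="list"]')
print(proc.do(response))  # -> ['http://example.com/a.html']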