Example #1
class Gmw(SpiderRedis):
    name = "gmw"
    website = "光明网"
    allowed_domain = "gmw.cn"
    start_urls = ['http://www.gmw.cn/']

    rules = [
        Rule(LinkExtractor(allow=("content_", ),
                           deny=("sports", "shipin", "health", "shuhua", "run",
                                 "xueshu", "e.gmw.cn", "v.gmw.cn", "gongyi",
                                 "jd", "ny", "guoxue", "history", "sixiang",
                                 "topics", "photo", "cg", "media", "meiwen",
                                 "reader", "bbs", "blog", "travel")),
             callback="get_news",
             follow=True),
        Rule(LinkExtractor(allow=("node_", ),
                           deny=("sports", "shipin", "health", "shuhua", "run",
                                 "xueshu", "e.gmw.cn", "v.gmw.cn", "gongyi",
                                 "jd", "ny", "guoxue", "history", "sixiang",
                                 "topics", "photo", "cg", "media", "meiwen",
                                 "reader", "bbs", "blog", "travel")),
             follow=True)
    ]

    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@id="articleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@id="articleTitle"]/text()').extract_first())

            loader.add_value(
                "date",
                response.xpath('//span[@id="pubTime"]/text()').extract_first()
                + ":00")
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@id="contentMain"]/descendant-or-self::text()').
                    extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #2
 def parse(self, response):
     link = LinkExtractor(restrict_xpaths="//ul[@class='cont_xiaoqu']//li")
     links = link.extract_links(response)
     for link_line in links:
         print(link_line.url,link_line.text)
         item = LinkdemoItem()
         item["url"] = link_line.url
         item["text"] = link_line.text
         yield item
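
The snippet above assumes a LinkdemoItem with url and text fields. A minimal sketch of that item definition (an assumption; the original project's items.py is not shown):

import scrapy

class LinkdemoItem(scrapy.Item):
    # The two fields the parse() callback above fills in for every extracted Link.
    url = scrapy.Field()
    text = scrapy.Field()
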
Example #3
File: book_new.py Project: NikolaXie/books
 def parse(self, response):
     link_regulation = LinkExtractor(restrict_css='section')
     url_list = link_regulation.extract_links(response)
     if url_list:
         for link in url_list:
             url = link.url
             if 'page-' in url:
                 yield scrapy.Request(url, callback=self.parse)
             else:
                 yield scrapy.Request(url, callback=self.parse_detail)
Example #4
    def parse(self, response):
        link = LinkExtractor(
            restrict_css=
            'body > div.wrap > div.middleright > div > div.cartoon_online_border > ul > li'
        )
        links = link.extract_links(response)
        # link1 = link.extract_links(response)[0]

        for link in links:
            yield Request(url=link.url, callback=self.parse2, dont_filter=True)
Example #5
    def parse(self, response):
        body = Selector(text=response.body)
        images = body.css('img').extract()
        for image in images:
            if PexelsScraper.src_extractor.findall(image):
                img_url = PexelsScraper.src_extractor.findall(image)[0]
                if img_url not in PexelsScraper.crawled_urls:
                    if 'http' not in img_url:
                        print(img_url)
                        print(self.start_urls[0])
                        print(PexelsScraper.domain_extractor.findall(
                            self.start_urls[0]))
                        img_url = PexelsScraper.domain_extractor.findall(
                            self.start_urls[0])[0][0] + img_url
                        print(img_url)
                    PexelsScraper.crawled_urls.add(img_url)
                    tags = ""
                    img_name = ""
                    img_type = ""
                    if PexelsScraper.tags_extractor.findall(image):
                        tags = PexelsScraper.tags_extractor.findall(
                            image)[0].replace(',', '').lower()
                    print(img_url, tags)
                    if '/' in img_url and len(
                            PexelsScraper.filename_extractor.findall(
                                img_url)) > 0:
                        img_name = PexelsScraper.filename_extractor.findall(
                            img_url)[0][0]
                        img_type = PexelsScraper.filename_extractor.findall(
                            img_url)[0][1]
                        print(img_name)
                    data = requests.get(img_url).content
                    im = Image.open(BytesIO(data))
                    width, height = im.size
                    # PexelsScraper.image_width = im.size[0]
                    # PexelsScraper.image_height = im.size[1]
                    img_aspect_ratio = self.calculate_aspect(width, height)
                    yield ImagecrawlerItem(source_url=response.url,
                                           img_url=img_url,
                                           alternate_text=tags,
                                           img_width=width,
                                           img_height=height,
                                           img_name=img_name,
                                           img_type=img_type,
                                           img_aspect_ratio=img_aspect_ratio)

        link_extractor = LinkExtractor()
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]
        # Crawl the filtered links
        for link in next_links:
            yield scrapy.Request(link, self.parse)
Example #6
    def parse(self, response):
        link_extractor = LinkExtractor(allow=RotaractSpider.url_matcher)
        links = [link.url for link in link_extractor.extract_links(response)]

        for link in links:
            flag = True
            article_links = []
            yield scrapy.Request(url=link,
                                 callback=self.parse_articles,
                                 meta={
                                     'article_links': article_links,
                                     'flag': flag
                                 })
Example #7
class BarneysSpider(CrawlSpider):
    name = 'barneys-crawl-spider'
    allowed_domains = ['www.barneys.com']
    start_urls = [
        'https://www.barneys.com/',
        'https://www.barneys.com/global/ajaxGlobalNav.jsp'
    ]
    product_parser = ProductParser()

    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36')
    }

    product_css = ['[id="ajaxGlobalNav"]', '.topnav-level-1']
    listing_css = ['[id="main-container"]']
    rules = [
        Rule(LinkExtractor(restrict_css=product_css),
             callback='parse',
             process_request='set_currency_cookie'),
        Rule(LinkExtractor(restrict_css=listing_css),
             callback='parse_item',
             process_request='set_currency_cookie')
    ]

    def parse(self, response):
        trail = response.meta.get('trail', [])
        title = self.extract_title(response)
        if title:
            trail = trail + [[title, response.url]]

        for request in super().parse(response):
            request.meta['trail'] = trail
            yield request

    def parse_item(self, response):
        return self.product_parser.parse(response)

    def set_currency_cookie(self, request):
        request.cookies['usr_currency'] = 'SE-SEK'
        return request

    def extract_title(self, response):
        title = response.css('title::text').extract_first()
        if title:
            title = title.split('|')[0].strip()

        return title
Example #8
class MovieSubjectSpider(CrawlSpider):
    name = 'movie_subject'
    allowed_domains = ['m.douban.com']
    start_urls = ['http://m.douban.com/']
    rules = (Rule(LinkExtractor(allow=r'movie/subject/(\d).*rec$'),
                  callback='parse_item',
                  follow=True,
                  process_request='cookie'), )

    def cookie(self, request):
        bid = ''.join(
            random.choice(string.ascii_letters + string.digits)
            for x in range(11))
        request.cookies['bid'] = bid
        return request

    def start_requests(self):
        for url in self.start_urls:
            bid = ''.join(
                random.choice(string.ascii_letters + string.digits)
                for x in range(11))
            yield Request(url, cookies={'bid': bid})

    def get_douban_id(self, subject, response):
        subject['douban_id'] = response.url[35:-10]
        return subject

    def parse_item(self, response):
        subject = Subject()
        self.get_douban_id(subject, response)
        subject['type'] = 'movie'
        return subject
Example #9
class DoubanSpider(CrawlSpider):

    name = 'douban'

    start_urls = {'https://movie.douban.com/top250/'}
    rules = (Rule(LinkExtractor(allow=r'https://movie.douban.com/top250.*'),
                  callback='parse_item'), )

    def parse_item(self, response):
        items = doubanItem()
        res = Selector(response)
        items['name'] = res.xpath(
            '//div[@class="hd"]/a/span[1]/text()').extract()  # movie titles
        items['imgs'] = res.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div/a/img/@src').extract(
            )
        # directors and lead actors
        directors_info = res.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[1]/text()[1]'
        ).extract()
        # year, country, genre
        movies_info = res.xpath(
            '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/p[1]/text()[2]'
        ).extract()
        # rating
        items['rate'] = res.xpath(
            '//span[@class="rating_num"]/text()').extract()
        # print(items)

        return items
Example #10
class Spider(CrawlSpider):
    name = 'mzitu'
    allowed_domains = ['mzitu.com', 'meizitu.net']
    start_urls = ['http://www.mzitu.com/']
    img_urls = []
    rules = [
        Rule(LinkExtractor(allow=('http://www.mzitu.com/\d{1,6}', ),
                           deny=('http://www.mzitu.com/\d{1,6}/\d{1,6}')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        item = MzituScrapyItem()
        max_num = response.xpath(
            '//div[@class="content"]/div[@class="pagenavi"]/a[last()-1]/span/text()'
        ).extract_first(default='N/A')
        item['name'] = response.xpath(
            "//div[@class='main']/div[@class='content']/h2[@class='main-title']/text()"
        ).extract_first(default='N/A')
        for num in range(1, int(max_num) + 1):
            page_url = response.url + '/' + str(num)
            yield Request(page_url, callback=self.img_url)
        item['image_urls'] = self.img_urls
        item['url'] = response.url
        yield item

    def img_url(self, response):
        img_urls = response.xpath(
            '//div[@class="main-image"]/p/a/img/@src').extract()
        for img_url in img_urls:
            self.img_urls.append(img_url)
Example #11
class Spider(CrawlSpider):
    name = 'mzitu'
    allowed_domains = ['mzitu.com']
    start_urls = ['http://www.mzitu.com/']
    img_urls = []
    rules = (
        Rule(LinkExtractor(allow=('http://www.mzitu.com/\d{1,6}',),
                           deny=('http://www.mzitu.com/\d{1,6}/\d{1,6}')),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = MzituScrapyItem()
        max_num = response.xpath(
                "descendant::div[@class='main']/div[@class='content']/div[@class='pagenavi']/a[last()-1]/span/text()").extract_first(
                default="N/A")
        item['name'] = response.xpath("./*//div[@class='main']/div[1]/h2/text()").extract_first(default='N/A')
        for num in range(1, int(max_num)):
            page_url = response.url + '/' + str(num)
            yield Request(page_url, callback=self.img_url)
        item['image_urls'] = self.img_urls
        yield item

    def img_url(self, response):
        image_urls = response.xpath("descendant::div[@class='main-image']/descendant::img/@src").extract()
        for img_url in image_urls:
            self.img_urls.append(img_url)
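
Both mzitu examples above accumulate image URLs in a single class-level img_urls list, so every item ends up referencing the same ever-growing list that mixes all albums together, and the item is yielded before its per-page requests have run. A sketch of an alternative that threads the item through Request.meta instead, assuming the same MzituScrapyItem fields and page structure:

    def parse_item(self, response):
        item = MzituScrapyItem()
        item['name'] = response.xpath(
            "//h2[@class='main-title']/text()").extract_first(default='N/A')
        item['url'] = response.url
        item['image_urls'] = []
        max_num = response.xpath(
            '//div[@class="pagenavi"]/a[last()-1]/span/text()'
        ).extract_first(default='1')
        page_urls = [response.url + '/' + str(num)
                     for num in range(1, int(max_num) + 1)]
        # Chain the page requests so the finished item is yielded exactly once.
        yield Request(page_urls[0], callback=self.img_url,
                      meta={'item': item, 'remaining': page_urls[1:]})

    def img_url(self, response):
        item = response.meta['item']
        item['image_urls'].extend(
            response.xpath('//div[@class="main-image"]/p/a/img/@src').extract())
        remaining = response.meta['remaining']
        if remaining:
            yield Request(remaining[0], callback=self.img_url,
                          meta={'item': item, 'remaining': remaining[1:]})
        else:
            yield item
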
Example #12
class DatabloggerSpider(CrawlSpider):
    name = "datablogger"
    allowed_domains = ["wiprodigital.com"]
    start_urls = ["https://wiprodigital.com/"]
    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True,
             callback="parse_items")
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse_items(self, response):
        items = []
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        for link in links:
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            if is_allowed:
                item = DatabloggerScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        return items
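
The manual allowed_domains loop above can also be handled by the extractor itself through its allow_domains argument, as Example #13 below does. A sketch of the same callback written that way (assuming the same DatabloggerScraperItem); note that allow_domains matches the link's domain, while the substring check above would also match a domain name appearing anywhere in the URL:

    def parse_items(self, response):
        links = LinkExtractor(allow_domains=self.allowed_domains,
                              canonicalize=True,
                              unique=True).extract_links(response)
        for link in links:
            item = DatabloggerScraperItem()
            item['url_from'] = response.url
            item['url_to'] = link.url
            yield item
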
Example #13
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(allow_domains=self.allowed_domains,
                              canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False

            for allow_domain in self.allowed_domains:
                if allow_domain in link.url:
                    is_allowed = True

            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = VnExpressScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
            # if is_allowed:
            #     title = response.xpath('//head/title/text()').extract()[0]

            #     texts = response.xpath('//*[not(self::script) and string-length(text()) > 0]/text()').extract()
            #     self.parse_text(title, texts)

            #     self.parse_html(title, response.body)
        return items
Example #14
class LKSpider(CrawlSpider):
    name = "lk"
    allowed_domains = ['www.lkong.net']
    start_urls = ['http://www.lkong.net/forum-60-1.html']

    rules = (Rule(LinkExtractor(allow=('/forum-60-\d{1,4}\.html', )),
                  callback='parse_page',
                  follow=True), )

    def parse_page(self, response):
        #url = thread
        #for thread in response.xpath('//th[@class="new"]/a/@href').extract():
        #yield scrapy.Request(url, callback=self.parse_thread)
        if response.url not in pages:
            pages.append(response.url)
            with open('page', 'a+') as f:
                f.write(response.url + '\n')

    def parse_thread(self, response):
        item = LkItem()
        item['home'] = response.url
        item['title'] = response.xpath(
            '//h1[@class="ts"]/a[1]/text()').extract()
        item['link'] = response.xpath('//h1[@class="ts"]/a[2]/@href').extract()
        item['content'] = response.xpath(
            '//div[@id="postlist"]/div[1]/descendant::td[@class="t_f"]'
        ).extract()[0].encode('utf8')
        return item
Example #15
 def parse(self, response):
     link = LinkExtractor(
         deny='/fang1/a2/',
         restrict_xpaths=
         '//div[@class="f-filter f-w1190"]//dd[@class="info"]/div[@class="thr-list"]//li[@class="item"]/a'
     )
     links = link.extract_links(response)
     for i in links:
         city_name = re.split('\/', i.url)[-3]
         yield Request(i.url,
                       callback=self.get_index,
                       meta={
                           'city_name': city_name,
                           'dont_redirect': True
                       },
                       dont_filter=True)
Example #16
class MySpider(CrawlSpider):
    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ["xkcd.com"]

    # The URLs to start with
    start_urls = ["https://xkcd.com/"]

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
    rules = [
        Rule(LinkExtractor(allow=(), canonicalize=True, unique=True),
             follow=True,
             callback="parse_items")
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    # Method for parsing items
    def parse_items(self, response):
        print(response)
Example #17
    def parse(self, response):
        UNKNOWN_TYPE = 'unknown'
        SERVER_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
        DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
        content_type = response.headers.get('Content-Type', UNKNOWN_TYPE)
        date_header = response.headers.get('Date')
        lastmodified_header = response.headers.get('Last-Modified')
        date = None
        lastmodified = None
        if date_header is not None:
            date = datetime.datetime.strptime(date_header, SERVER_DATE_FORMAT).strftime(DATETIME_FORMAT)
        if lastmodified_header is not None:
            lastmodified = datetime.datetime.strptime(lastmodified_header, SERVER_DATE_FORMAT).strftime(DATETIME_FORMAT)

        yield {
                #'headers': response.headers,
                'url_to': response.url,
                'content_type' : content_type,
                'date' : date,
                'source_server' : response.headers.get('Server', UNKNOWN_TYPE),
                'content_length' : response.headers.get('Content-Length', 0),
                'last_modified' : lastmodified
        }

        if self.allowedcontenttype(content_type):
            links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
            for link in links:
                if '..' not in link.url and link.url is not response.url:
                    yield scrapy.Request(link.url)
                    yield scrapy.Request(link.url, method="HEAD")
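
One caveat for the snippet above: on Python 3, Scrapy returns header values as bytes, so the strptime() calls would raise TypeError. A sketch of the same parsing with explicit decoding, reusing the format constants defined above:

        content_type = response.headers.get('Content-Type', b'unknown').decode('latin-1')
        date_header = response.headers.get('Date')
        date = None
        if date_header is not None:
            date = datetime.datetime.strptime(
                date_header.decode('ascii'),
                SERVER_DATE_FORMAT).strftime(DATETIME_FORMAT)
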
Example #18
File: tecent.py Project: hanyang7427/study
class TecentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/position.php?start=0#a']
    for i in range(274):
        strI = str(i*10)
        start_urls.append("http://hr.tencent.com/position.php?start="+strI+"#a")
    # start_urls = ['http://hr.tencent.com/position.php?start=0#a',
    #               'http://hr.tencent.com/position.php?start=10#a',
    #               'http://hr.tencent.com/position.php?start=20#a',
    #               'http://hr.tencent.com/position.php?start=20#a',]


    pageLink = LinkExtractor(allow=("start=\d+"))
    # Extract the links from each listing page, issue requests one by one, and handle them via the callback
    rules = [
        Rule(pageLink, callback="parse", follow=True)
    ]

    def parse(self, response):
        for it in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            #print(type(item))
            item['positionName'] = it.xpath("./td[1]/a/text()").extract()[0]
            item['positionType'] = it.xpath("./td[2]/text()").extract()[0]
            item['pNum'] = it.xpath("./td[3]/text()").extract()[0]
            item['address'] = it.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = it.xpath("./td[5]/text()").extract()[0]

            yield item
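
Because TecentSpider subclasses scrapy.Spider, the rules attribute above is never used; the 274 pre-generated start_urls are what actually drive the crawl. A sketch of a CrawlSpider variant that lets the pagination Rule do that work instead (assuming the same TencentspiderItem; the callback is renamed because CrawlSpider reserves parse for its own rule handling):

class TencentCrawlSpider(CrawlSpider):
    name = 'tencent_crawl'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/position.php?start=0#a']

    rules = [
        Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_page', follow=True)
    ]

    def parse_page(self, response):
        for it in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
            item = TencentspiderItem()
            item['positionName'] = it.xpath("./td[1]/a/text()").extract_first()
            item['positionType'] = it.xpath("./td[2]/text()").extract_first()
            item['pNum'] = it.xpath("./td[3]/text()").extract_first()
            item['address'] = it.xpath("./td[4]/text()").extract_first()
            item['publishTime'] = it.xpath("./td[5]/text()").extract_first()
            yield item
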
Example #19
class CrawlSpider(scrapy.CrawlSpider):
    name = 'test2'
    allowed_domains = ['', ]
    start_urls = ['', ]
    rules = (
        Rule(LinkExtractor(allow='category\.php', deny=('subsection\.php'))),
        Rule(LinkExtractor(allow='item\.php'),callback='parse_item'),
    )

    def parse_item(self,response):
        self.logger.info('aaa%s',response.url)
        item = scrapy.Item()
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
        return item
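
Assigning to a bare scrapy.Item() raises KeyError because no fields are declared on it. This snippet mirrors the CrawlSpider example from the Scrapy docs and presumes an item class along these lines (a sketch; the TestItem name is made up, and parse_item would construct TestItem() instead of scrapy.Item()):

import scrapy

class TestItem(scrapy.Item):
    # Declared fields so the assignments in parse_item() above do not raise KeyError.
    id = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
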
Example #20
    def parse_items(self, response):
        # The list of items that are found on the particular page
        self.depth -= 1
        if self.depth <= 0:
            return []
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # links = response.xpath('//a[@href]')
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False

            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url and len(link.url) > len(
                        response.url):
                    is_allowed = True

            article_tag = response.xpath(
                "//li[contains(@class, 'article__loader')]")
            if not article_tag:
                is_allowed = False
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = NewsScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)

        # Return all the found items
        return items
Example #21
class DatabloggerSpider(scrapy.Spider):
    name = 'datablogger'
    allowed_domains = ['www.mctopherganesh.com']
    start_urls = ['http://www.mctopherganesh.com/']
    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True,
             callback="parse")
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        items = []
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        for link in links:
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            if is_allowed:
                item = DatabloggerScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)
        return items
Example #22
class LkLoginSpider(CrawlSpider):
    name = "lkl"
    allowed_domains = ['www.lkong.net']
    #start_urls = ['http://www.lkong.net/member.php?mod=logging&action=login']
    #start_urls = ['http://www.lkong.net/forum-14-1.html']

    rules = (Rule(LinkExtractor(allow=('/thread.+\.html', )),
                  callback='parse_thread'), )

    def start_requests(self):
        return [
            scrapy.FormRequest(
                'http://www.lkong.net/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=L7On7&inajax=1',
                formdata={
                    'username': '******',
                    'password': '******',
                    'answer': 'email',
                    'formhash': 'forumhash',
                    'referer': '/forum.php',
                    'questionid': '0',
                    'loginsubmit': 'True',
                    'cookietime': '2592000'
                },
                callback=self.after_login)
        ]

    def after_login(self, response):
        #print response.body.decode('utf8')
        for url in start_url:
            yield self.make_requests_from_url(url)

    def parse_thread(self, response):
        print(response.body.decode('utf8'))
        print(response.url)
Example #23
class mzitu_spider(CrawlSpider):
    name = 'mzitu'

    start_urls = {'http://www.mzitu.com/all/'}

    rules = {
        Rule(LinkExtractor(allow=r'http://www.mzitu.com/\d{1,6}',
                           deny=r'http://www.mzitu.com/\d{1,6}/\d{1,6}'),
             callback='parse_item',
             follow=True)
    }
    img_urls = []

    def parse_item(self, response):
        item = MzituSpiderItem()
        total_pages = response.xpath(
            '/html/body/div[2]/div[1]/div[4]/a[5]/span/text()').extract()[
                0]  # str
        item['name'] = response.xpath(
            '/html/body/div[2]/div[1]/h2/text()').extract()
        item['url'] = response.url  # used by the middleware to set the Referer header on the browser-style request
        for i in range(1, int(total_pages) - 1):
            page_url = response.url + '/' + str(i)  # URL of each image page
            yield scrapy.Request(page_url, callback=self.img_url)
        item['img_urls'] = self.img_urls
        yield item

    def img_url(self, response):
        img_urls = response.xpath(
            "/html/body/div[2]/div[1]/div[3]/p/a/img/@src").extract()
        for img_url in img_urls:
            self.img_urls.append(img_url)
Example #24
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = MyscraperItem()
                item['link'] = link.url
                items.append(item)
                patterns = [
                    "kalerkantho.com/online/national/",
                    "kalerkantho.com/online/Politics/",
                    "kalerkantho.com/online/Court/",
                    "kalerkantho.com/online/world/",
                    "kalerkantho.com/online/business/",
                    "kalerkantho.com/online/sahitya/",
                    "kalerkantho.com/online/sport/",
                    "kalerkantho.com/online/entertainment/",
                    "kalerkantho.com/online/info-tech/",
                    "kalerkantho.com/online/prescription/"
                ]

                file = None
                if patterns[0] in link.url:
                    file = open('../../data/national.csv', 'a')
                if patterns[1] in link.url:
                    file = open('../../data/politics.csv', 'a')
                if patterns[2] in link.url:
                    file = open('../../data/court.csv', 'a')
                if patterns[3] in link.url:
                    file = open('../../data/world.csv', 'a')
                if patterns[4] in link.url:
                    file = open('../../data/business.csv', 'a')
                if patterns[5] in link.url:
                    file = open('../../data/literature.csv', 'a')
                if patterns[6] in link.url:
                    file = open('../../data/sports.csv', 'a')
                if patterns[7] in link.url:
                    file = open('../../data/entertainment.csv', 'a')
                if patterns[8] in link.url:
                    file = open('../../data/tech.csv', 'a')
                if patterns[9] in link.url:
                    file = open('../../data/medical.csv', 'a')

                if file is not None:
                    file.write(urlShortener(link.url) + "\n")
                    file.close()

        # Return all the found items
        return items
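
The chain of if statements above maps URL patterns to CSV files one at a time. A sketch of the same dispatch as a single dict lookup (the CATEGORY_FILES name and helper function are made up; the paths and the urlShortener helper come from the snippet itself):

CATEGORY_FILES = {
    "kalerkantho.com/online/national/": "../../data/national.csv",
    "kalerkantho.com/online/Politics/": "../../data/politics.csv",
    "kalerkantho.com/online/Court/": "../../data/court.csv",
    "kalerkantho.com/online/world/": "../../data/world.csv",
    "kalerkantho.com/online/business/": "../../data/business.csv",
    "kalerkantho.com/online/sahitya/": "../../data/literature.csv",
    "kalerkantho.com/online/sport/": "../../data/sports.csv",
    "kalerkantho.com/online/entertainment/": "../../data/entertainment.csv",
    "kalerkantho.com/online/info-tech/": "../../data/tech.csv",
    "kalerkantho.com/online/prescription/": "../../data/medical.csv",
}

def write_category_link(url):
    # Append the shortened URL to the CSV of the first matching category, if any.
    for pattern, path in CATEGORY_FILES.items():
        if pattern in url:
            with open(path, 'a') as f:
                f.write(urlShortener(url) + "\n")
            break

Inside the loop above, the whole if chain then collapses to write_category_link(link.url).
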
Example #25
 def get_index(self, response):
     city_name = response.meta['city_name']
     link = LinkExtractor(
         allow='/fang1/.*htm',
         restrict_xpaths=
         '//div[@class="f-main f-clear f-w1190"]//div[@class="f-main-list"]/div[@class="f-list js-tips-list"]/div'
     )
     links = link.extract_links(response)
     for i in links:
         city = re.split('\/|\.', i.url)[2]
         yield Request(i.url,
                       callback=self.get_message,
                       meta={
                           'city': city,
                           'city_name': city_name,
                           'dont_redirect': True
                       },
                       dont_filter=True)
Example #26
class AbyznewslinksSpider(CrawlSpider):
    name = 'abz'
    depth = 400
    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ['thestar.com']

    # The URLs to start with
    start_urls = ['https://www.thestar.com/']

    # This spider has one rule: extract all (unique and canonicalized) links,
    # follow them and parse them using the parse_items method
    rules = [
        Rule(LinkExtractor(canonicalize=True, unique=True),
             follow=True,
             callback="parse_items")
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):

        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=False)

    # Method for parsing items
    def parse_items(self, response):
        # The list of items that are found on the particular page
        self.depth -= 1
        if self.depth <= 0:
            return []
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        # links = response.xpath('//a[@href]')
        # Now go through all the found links
        for link in links:
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False

            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url and len(link.url) > len(
                        response.url):
                    is_allowed = True

            article_tag = response.xpath(
                "//li[contains(@class, 'article__loader')]")
            if not article_tag:
                is_allowed = False
            # If it is allowed, create a new item and add it to the list of found items
            if is_allowed:
                item = NewsScraperItem()
                item['url_from'] = response.url
                item['url_to'] = link.url
                items.append(item)

        # Return all the found items
        return items
Example #27
class ComputrabajoSpider(CrawlSpider):

    name = "mi primer crowlspider"
    start_urls = ['https://www.ve.computrabajo.com/ofertas-de-trabajo/']
    allowed_domains = ['www.ve.computrabajo.com']

    rules = (
        Rule(LinkExtractor(allow=r'p=')),
        Rule(LinkExtractor(allow=r'/oferta-de-trabajo-de-'), callback='parse_items'),
    )

    def parse_items(self, response):
        item = ItemLoader(Articulos(), response)     
        item.add_xpath('title', '//*[@id="MainContainer"]/article/section[1]/div[1]/div/h2/text()')
        item.add_xpath('description', '//*[@id="MainContainer"]/article/section[1]/div[2]/ul/li[3]/text()')
        yield item.load_item()

# scrapy runspider multiplepages.py -o ../../resources/computrabajo.csv -t csv
Example #28
File: ahthor.py Project: zybk01/ppython
    def callload(self,response):

        link = LinkExtractor(restrict_xpaths='//*[@cellspacing="1"]//a')
        link = link.extract_links(response)
        for urllist in link:
            url = urllist.url
            if url not in self.loaded:
                self.loaded.append(url)

                request = scrapy.Request(url, callback=self.parse,
                                         headers={'User-Agent': 'Mozilla/5.0'},
                                         dont_filter=True)
                path = self.path + '/' + urllist.text
                request.meta['item'] = path
                yield request
            time.sleep(2)
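
The time.sleep(2) at the end of the loop above blocks the Twisted reactor, pausing the entire crawl rather than just this spider's requests. A sketch of the usual alternative, throttling through Scrapy settings on the spider class (the values are illustrative):

    custom_settings = {
        'DOWNLOAD_DELAY': 2,               # wait about 2 seconds between requests
        'RANDOMIZE_DOWNLOAD_DELAY': True,  # jitter the delay between 0.5x and 1.5x
    }
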
Example #29
class Chinanews(SpiderRedis):
    name = "chinanews"
    website = u"中国新闻网"
    allowed_domain = "chinanews.com"
    start_urls = ['http://www.chinanews.com/']

    rules = [
        Rule(LinkExtractor(allow=r"\d{4}/\d{2}-\d{2}/\d{7}.shtml$"),
             callback="get_news",
             follow=True),
        Rule(LinkExtractor(allow=("scroll-news", "china", "world", "society",
                                  "finance", "business", "fortune", "gangao",
                                  "taiwan", "huaren", "theory", "life")),
             follow=True)
    ]

    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@id="cont_1_1_2"]/h1[1]/text()').extract_first())
            loader.add_value(
                "date",
                response.xpath(
                    '//span[@id="pubtime_baidu"]/text()').extract_first())
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@class="left_zw"]/descendant-or-self::text()').
                    extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #30
    def parse(self, response):
        USER = True
        next_links = []
        body = Selector(text=response.body)
        images = body.css('img.photo-item__img').extract()
        for image in images:
            img_url = PexelsScraper.src_extractor.findall(image)[0]
            tags = [
                tag.replace(',', '').lower() for tag in
                PexelsScraper.tags_extractor.findall(image)[0].split(' ')
            ]
            print("Tags_check: ")
            print(tags)
        link_extractor = LinkExtractor(allow=PexelsScraper.url_matcher)
        next_links = [
            link.url for link in link_extractor.extract_links(response)
            if not self.is_extracted(link.url)
        ]  # Crawl the filtered links
        next_page_url = response.css(
            'div.pagination a[rel="next"]::attr(href)').extract_first()
        if next_page_url:
            next_page_url = URL + next_page_url
            next_links.append(next_page_url)
        print("next_page_url")
        print(next_page_url)
        if USER:
            links = response.css("a.pull-left::attr(href)").extract_first()
            print(links)
            if links:
                links = "https://www.pexels.com" + links
                for i in range(10):
                    next_links.append(links + "?page=" + str(i))
                print("go into user parse")
                #request.meta['main_url'] = URL
                #yield request
                for each in next_links:
                    yield scrapy.Request(each, self.parse_by_user)
                print("should have done user parse")
                print("Links_check: {}".format(links))

        for link in next_links:
            print("next_links")
            print(link)
            yield scrapy.Request(link, self.parse)