Example #1
class YidongSpider(Spider):
    name = "zhaobiao"
    allowed_domains = ["b2b.10086.cn"]
    start_urls = [
        "https://b2b.10086.cn/b2b/main/listVendorNotice.html?noticeType=2"
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('/b2b.10086.cn/', )),
                  callback='parse_page',
                  follow=True), )

    def parse(self, response):
        message = Selector(response)
        item = YidongItem()
        item['company'] = message.xpath(
            '//table//tr/td/text()').extract_first()
        print item['company']
        item['title'] = message.xpath('//table//tr/td/text()').extract_first()

        return item
Example #2
class ArticleSpider(CrawlSpider):
    #log.start(logfile='log.txt', loglevel=log.CRITICAL)
    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        item = Article()
        title = response.xpath('//h1/text()')[0].extract()
        print("Title is: " + title)
        item['title'] = title
        return item
Example #3
class GutenbergSpider(CrawlSpider):
    
    name = "gutenberg.org"
    allowed_domains = ["gutenberg.org"]
    # Hard code for King Arthur for now...
    start_urls = ["http://www.gutenberg.org/ebooks/12753"]
    rules = [Rule(SgmlLinkExtractor(allow=["/ebooks/\d+"]), "parse_book_record")]

    def parse_book_record(self, response):
        self.log("Parsing %s", response.url)

        selector = HtmlXPathSelector(response)

        book = GutenbergItem()
        # But what if there are more than one h1s in the document?
        # Weird itemprop attribute in the gutenberg code.
        book["title"] = selector.select("//h1/text()").extract()
        spam = selector.select("//a[@class='link']").extract()
        print spam
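The comment above asks what happens when the page has more than one h1. A minimal sketch, assuming a newer Scrapy Selector API than the HtmlXPathSelector used here (extract_first() is a more recent addition), that keeps only the first match and tolerates its absence:

# Hypothetical variant: keep only the first <h1> text node, or skip the field if absent.
title = response.xpath("//h1/text()").extract_first()
if title is not None:
    book["title"] = title.strip()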
Example #4
class GmwSpider(CrawlSpider):
    name = 'dangdang'
    allowed_domains = ['bang.dangdang.com']
    start_urls = [
        'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-month-2016-'
        + str(m) + '-1-' + str(x) for m in range(1, 5) for x in range(1, 26)
    ]
    rules = (Rule(SgmlLinkExtractor(allow=r'01.00.00.00.00.00-month'),
                  callback='parse_item'), )

    def parse_item(self, response):
        i = GmrbItem()
        url = response.url
        i['name'] = response.xpath(
            '//ul[@class="bang_list clearfix bang_list_mode"]').extract()
        i['ban'] = ''
        i['date'] = ''
        i['content'] = ''
        return i
Example #5
    def parse(self, response):
        url = response.url
        if 'redirect_urls' in response.meta:
            url = response.meta['redirect_urls'][0]

        stop_spreading = False
        self.depth_lock.acquire()
        if url not in self.depth:
            print "WEIRD " + url
            stop_spreading = True
        elif self.depth[url] >= self.max_depth:
            stop_spreading = True
        self.depth_lock.release()

        if stop_spreading == True:
            return

        link_extractor = SgmlLinkExtractor(unique=False)
        links = link_extractor.extract_links(response)

        requests = []
        print len(links)
        for link in links:
            if link.text.strip() != '':
                self.output_file_lock.acquire()
                self.output_file.write(url)
                self.output_file.write('|')
                self.output_file.write(link.url)
                self.output_file.write('|')
                self.output_file.write(link.text.encode('utf-8', 'replace'))
                self.output_file.write('\n')
                self.output_file_lock.release()

                self.depth_lock.acquire()
                should_spread = False
                if link.url not in self.depth or self.depth[
                        url] + 1 < self.depth[link.url]:
                    self.depth[link.url] = self.depth[url] + 1
                    should_spread = True
                self.depth_lock.release()

                if should_spread == True:
                    yield Request(url=link.url)
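As an aside, the depth dictionary and locks above reimplement what Scrapy's depth middleware already offers; a minimal sketch of the equivalent project settings (the limit value is illustrative, not taken from this spider):

# settings.py -- illustrative values, not part of the original example
DEPTH_LIMIT = 3              # drop requests deeper than this many hops from the start URLs
DEPTH_STATS_VERBOSE = True   # record request counts per depth level in the crawl stats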
Example #6
class RecursiveSpider(CrawlSpider):
    name = "tn"
    allowed_domains = ["tn8.tv"]
    start_urls = ['http://www.tn8.tv/ultima-hora/']

    rules = (Rule(SgmlLinkExtractor(allow=("/ultima-hora/*", )),
                  callback='parse_item',
                  follow=False), )

    def parse_item(self, response):
        sel = Selector(response)
        item = RecursiveItem()
        item['URL'] = response.request.url
        idTitulo = item['URL'].split('/')[4].split('-')[0]
        item['TITLE'] = sel.xpath('//*[@id="post-' + idTitulo +
                                  '"]/header/h1/text()').extract()
        item['CONTENT'] = sel.xpath('//*[@id="post-' + idTitulo +
                                    '"]/div[2]/p[1]/text()').extract()
        return item
Example #7
    def test_process_value(self):
        """Test restrict_xpaths with encodings"""
        html = """
        <a href="javascript:goToPage('../other/page.html','photo','width=600,height=540,scrollbars'); return false">Link text</a>
        <a href="/about.html">About us</a>
        """
        response = HtmlResponse("http://example.org/somepage/index.html",
                                body=html,
                                encoding='windows-1252')

        def process_value(value):
            m = re.search("javascript:goToPage\('(.*?)'", value)
            if m:
                return m.group(1)

        lx = SgmlLinkExtractor(process_value=process_value)
        self.assertEqual(
            lx.extract_links(response),
            [Link(url='http://example.org/other/page.html', text='Link text')])
Example #8
    def test_restrict_xpaths_encoding(self):
        """Test restrict_xpaths with encodings"""
        html = """<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <div class='links'>
        <p><a href="/about.html">About us\xa3</a></p>
        </div>
        <div>
        <p><a href="/nofollow.html">This shouldn't be followed</a></p>
        </div>
        </body></html>"""
        response = HtmlResponse("http://example.org/somepage/index.html",
                                body=html,
                                encoding='windows-1252')

        lx = SgmlLinkExtractor(restrict_xpaths="//div[@class='links']")
        self.assertEqual(
            lx.extract_links(response),
            [Link(url='http://example.org/about.html', text=u'About us\xa3')])
Example #9
    def __init__(self, *args, **kwargs):
        super(UrlSpider, self).__init__(*args, **kwargs)

        self.start_urls = [kwargs.get('start_url')]

        follow = True if kwargs.get('follow') == 'true' else False
        self.rules = (Rule(SgmlLinkExtractor(allow=('')),
                           callback='parse_url',
                           follow=follow), )
        super(UrlSpider, self)._compile_rules()

        try:
            proxy = kwargs.get('proxy')
            service_args = [
                '--proxy=' + proxy,
                '--proxy-type=http',
            ]
        except:
            service_args = None
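Since this __init__ takes all of its configuration from keyword arguments, the spider is presumably launched with those arguments supplied externally; a minimal sketch using CrawlerProcess (the URL and proxy values are illustrative):

# Illustrative launcher: keyword arguments are forwarded to UrlSpider.__init__(**kwargs)
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(UrlSpider, start_url='http://example.com', follow='true',
              proxy='127.0.0.1:8080')
process.start()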
Example #10
class ArticleSpider(Spider):
    name = "article"
    allowed_domains = ["en.wikipedia.org"]
    start_urls = [
        "http://en.wikipedia.org/wiki/Main_Page",
        "http://en.wikipedia.org/wiki/Python_%28programming_language%29"
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow=('(/wiki/)((?!:).)*$'), ),
             callback="parse_item",
             follow=True)
    ]

    def parse_item(self, response):
        item = Article()
        title = response.xpath('//h1/text()')[0].extract()
        print "Title is: " + title
        # item['title'] = title
        return item
Example #11
    def parse(self, response):

        sel = Selector(response)
        try:
            cur_page = sel.xpath(
                '//div[@class="multi-page"]/i[@class="curr"]/text()').extract(
                )[0]
            links = SgmlLinkExtractor(
                allow=(r'http://.+.anjuke.com/prop/view/.+'),
                restrict_xpaths=('//ul[@id="houselist-mod"]'),
                unique=0).extract_links(response)
            r = Redis()
            for link in links:
                try:
                    r.lpush('anjuke_beijing_spider:data_url', link.url)
                except Exception as e:
                    print Exception, ":", e
        except Exception as e:
            print Exception, ":", e
Example #12
class MissionSpider(CrawlSpider):
    name = "mission"
    allowed_domains = ["missionchamber.com", "services.missionchamber.com"]
    start_urls = [
        "http://services.missionchamber.com/list/QuickLinkMembers/AllCategories.htm"
    ]

    rules = (Rule(SgmlLinkExtractor(allow=('\/member\/.+\.htm')),
                  callback='parse_item'), )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        business = Business()
        business['id'] = ''.join(re.findall(r"\-(\d+)\.htm",
                                            response.url)).strip()
        business['name'] = self.get_one(
            hxs, "//div[@class='cm_memheader']//h1/text()")
        business['contact'] = self.get_one(
            hxs, "//span[@class='cm_repname']//text()")

        contact = self.get_one(
            hxs, "//td[@class='cm_infotext'][position()<2]/text()")
        bits = re.findall(r'(.+)\n(.+)\n*(.+)?\n*(.+)?', contact)
        if bits:
            if len(bits[0]) >= 3:
                business['phone'] = bits[0][2]

            if len(bits[0]) >= 4:
                business['fax'] = ''.join(re.findall(r':(.+)',
                                                     bits[0][3])).strip()

            if len(bits[0]) >= 2:
                business['address'] = '\n'.join(bits[0][0:2])

        business['website'] = self.get_one(
            hxs, "//tr[@class='cm_noprint']/td[@class='cm_infotext']/a/@href")
        business['categories'] = self.get_one(
            hxs, "//span[@class='cm_member_categories']/text()")
        return business

    def get_one(self, hxs, selector):
        return '\n'.join(hxs.select(selector).extract()).strip()
Example #13
class GrSpider(CrawlSpider):
    name = "lawyercom"
    login_page = 'http://www.lawyercom.ru/login/'
    allowed_domains = ["lawyercom.ru"]
    start_urls = ('http://www.lawyercom.ru',)

    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//a',)),callback='parse_item'),)                                           

    def init_request(self):
        """This function is called before crawling starts."""
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(response,
                    formdata={'username': '******', 'password': '******'},
                    callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        if "section" in response.body:
            self.log("Successfully logged in. Let's start crawling!")
            # Now the crawling can begin..
            self.initialized()
        else:
            self.log("Bad times :(")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse_item(self, response):
        item = GrItem() 
        soup = BeautifulSoup(response.body)
        hxs = HtmlXPathSelector(response=response)

        item['url'] = response.url
        try:
            item['found'] = ','.join(set(email_pattern.findall(str(response.body))))
        except:
            item['found'] = None

        return item 
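Note that init_request() and self.initialized() are hooks provided by Scrapy's InitSpider, which this class does not inherit from, so the login step will not run as written. A minimal sketch of the base-class declaration those hooks assume (old scrapy.contrib import path; whether InitSpider combines cleanly with CrawlSpider depends on the Scrapy version):

# Hypothetical: InitSpider supplies init_request()/initialized(); it has to be
# in the class hierarchy for the login request above to fire before crawling.
from scrapy.contrib.spiders.init import InitSpider

class GrSpider(InitSpider, CrawlSpider):
    ...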
Example #14
    def parsePodcast(self, response):
        hxs = HtmlXPathSelector(response)

        try:
            title = hxs.select('//div[contains(@id,"title")]/h1/text()').extract()[0]
        except:
            title = None

        try:
            author = hxs.select('//div[contains(@id,"title")]/h2/text()').extract()[0]
        except:
            author = None

        try:
            category = hxs.select('//li[contains(@class,"genre")]/a/text()').extract()[0]
        except:
            category = None

        try:
            lang = hxs.select('//li[contains(@class,"language")]/text()').extract()[0]
        except:
            lang = None

        try:
            extractor = SgmlLinkExtractor(restrict_xpaths='//a[contains(text(),"Podcast Website")]')
            website = extractor.extract_links(response)[0].url

            #website = hxs.select('//a[contains(text(),"Podcast Website")]/@href').extract()[0]
        except:
            website = None

        try:
            price = hxs.select('//td[contains(@class,"price")]/span/text()').extract()[0]
        except:
            price = None

        item = PodcastItem(title=title, author=author, category=category,
                           lang=lang, website=website, price=price)

        if website is not None and len(website) > 0:
            request = Request(website, callback=self.parsePage)
            request.meta['item'] = item
            yield request
Example #15
class MySpider(CrawlSpider):
	name = "craig"
	allowed_domains=["craigslist.org"]
	start_urls = ["http://sfbay.craigslist.org/search/npo"]

	rules = (
		Rule(SgmlLinkExtractor(allow=(),restrict_css=('a.button.next')),callback="parse_items",follow=True),
		)


	def parse_items(self,response):
		# hxs = Selector(response)
		# titles = hxs.xpath("//span[@class='p1']")
		# items =[]
		# for titles in titles:
		# 	item = CraiglistSampleItem()
		# 	item["title"] = titles.xpath("a/text()").extract()
		# 	item["link"] = titles.xpath("a/@href").extract()
		# 	items.append(item)
		# return items

		#titles = response.css("p.result-info").extract()
		
		
		
		titles = response.css("li.result-row p.result-info a::text").extract()
		links = response.css("li.result-row p.result-info a::attr('href')").extract()
			
		info = zip(titles,links)
		items =[]
		for group in info:

			if group[1] != "#" and "\n" not in group[0]: 
				 item = CraiglistSampleItem()
				 item["title"] = group[0]
				 item["link"] = group[1]
				 items.append(item)

		print items


		return items
Example #16
class PublixSpider(CrawlSpider):
   name = "publix"
   allowed_domains = ["publix.com"]
   start_urls = [
       "http://mobilead.publix.com/publixmobile/Default.aspx?action=browsecategoryl1&storeid=2500144&viewmode=0&cattreeid=5117975",
   ]
   rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        #Rule(SgmlLinkExtractor(allow=('default\.aspx', ), deny=('subsection\.php', ))),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(SgmlLinkExtractor(allow=()), callback='parseCoupons'),
    )

   def parseDept(self, response):
       pass

   def parseCoupons(self, response):
       #filename = 'test'
       #open(filename, 'wb').write(response.body)
       start = response.body.find('<div id="iphoneline">')
       end = response.body.find('<div id="footerContainer">')

       response_new = response.replace(body=response.body[start:end])

       hxs = HtmlXPathSelector(response_new)

       item = PublixItem()
       item['title'] = hxs.select("//div[@class='product-title']/text()").extract()
       item['price'] = hxs.select("//div[@class='product-deal']/text()").extract()
       item['priceDetails'] = hxs.select("//div[@class='product-pricequal']/text()").extract()
       item['desc'] = hxs.select("//div[@class='product-desc']/text()").extract()
       item['expiry'] = hxs.select("//div[@class='validdate']/text()").extract()

       item['link'] = response.url
       item['image'] = hxs.select("//img/@src").extract()

       return item
Example #17
class OneSpider(CrawlSpider):
    name = "01_crawl"
    DOWNLOAD_DELAY = 10
    start_urls = ["http://bit.ly/M1deTs"]
    rules = (Rule(SgmlLinkExtractor(allow=('symb=',)), follow=True, callback='parse_item'),)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[@class="row_view"]/li')
        items = []
        for site in sites:
            item = GameItem()
            item['title'] = (site.select('div/h2/a/text()').extract()).pop()
            item['url'] = (site.select('div/h2/a/@href').extract()).pop()
            price = site.select('div/div[2]//div/div/table/tr/td/span/span[@class="currency"]/text()').re('(\d+)')
            item['price'] = float(price[0]) + 0.01*float(price[-1])
            item['quantity'] = len(site.select('div/div/div/form/div/table/tr/td/select/option/text()').extract())
            item['timestamp'] = datetime.datetime.now()
            items.append(item)
        return items
Example #18
class PyVideoSpider(CrawlSpider):
    name = 'pyvideo.org'
    allowed_domains = ['pyvideo.org']
    start_urls = ['http://www.pyvideo.org/speaker/']

    rules = (
        # Extract links matching speakers
        Rule(SgmlLinkExtractor(allow=('/speaker/\d+/', )),
             callback='parse_speaker'), )

    def parse_speaker(self, response):
        sel = Selector(response)
        name = sel.xpath('//h1/text()').extract()[0].strip()
        for conf in sel.xpath('//div[@class="video-summary-data"]'):
            speaker = Speaker()
            speaker['name'] = name
            conf_text = conf.select('.//a/text()')[1].extract()
            speaker['conference'] = re.sub('\s20\d\d$', '', conf_text)
            speaker['year'] = conf.re('20\d\d')[0]
            yield speaker
Example #19
class TestProxiesSpider(CrawlSpider):
    name = "testproxies"
    allowed_domains = ["habrahabr.ru"]
    start_urls = ["http://habrahabr.ru/page%s" % page for page in xrange(1, 4)]

    rules = (Rule(SgmlLinkExtractor(allow=('page')), callback='parse_item'), )

    def parse_item(self, response):
        hxs = Selector(response)
        divs = hxs.xpath("//div[@class='posts shortcuts_items']/div")
        items = []
        i = 0
        for div in divs:
            item = TestItem()
            item['id'] = div.xpath(
                "//div[@class='published']/following-sibling::h1/a/@href").re(
                    r'\d+')[i]
            i += 1
            items.append(item)
        return items
Example #20
class PandoraSpider(CrawlSpider):
    name = "pandora_spider"
    allowed_domains = ["pandora-and-pandora.blogspot.com"]
    start_urls = ["http://pandora-and-pandora.blogspot.com"]
    manager = ItemManager(name)
    rules = (Rule(SgmlLinkExtractor(), callback='parse_item', follow=True), )

    def process_links(self, response):
        pass

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        paragraphs = hxs.select(r"//div//text()")

        for paragraph in paragraphs:
            paragraph = paragraph.extract().strip()
            paragraph = converter.convert(paragraph, 'zawgyi', 'unicode')

            if is_large_paragraph(paragraph) and is_mainly_myanmar(paragraph):
                self.manager.add_item(paragraph)
Example #21
class TencentNewsSpider(CrawlSpider):
    name = 'tencent_news_spider'
    allowed_domains = ['news.qq.com']
    start_urls = ['http://news.qq.com']
    url_pattern = r'(.*)/a/(\d{8})/(\d+)\.htm'
    rules = [Rule(SgmlLinkExtractor(allow=[url_pattern]), 'parse_news')]
    
    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        item = TencentItem()
        item['source'] = 'tencent' # pattern.group(1)
        item['date'] = pattern.group(2)
        item['newsId'] = pattern.group(3)
        item['cmtId'] = (sel.re(r"cmt_id = (.*);"))[0] # unicode string
        item['comments'] = {'link':str('http://coral.qq.com/')+item['cmtId']}
        item['contents'] = {'link':str(response.url), 'title':u'', 'passage':u''}
        item['contents']['title'] = sel.xpath('//h1/text()').extract()[0]
        item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
        return item
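ListCombiner is not a Scrapy name, so it is presumably a helper defined elsewhere in this project; a minimal stand-in, assuming it simply concatenates the extracted paragraph strings:

# Hypothetical stand-in for the project's ListCombiner helper
def ListCombiner(lst):
    return ''.join(lst)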
Example #22
class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (Rule(SgmlLinkExtractor(
        allow=(), restrict_xpaths=('//a[@class="button next"]', )),
                  callback="parse_items",
                  follow=True), )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").extract()
            item["link"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
Example #23
class StackSpider(CrawlSpider):
    name = "stackcrawl"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "https://stackoverflow.com/questions?sort=newest",
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('&page=\d')),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        questions = hxs.xpath('//div[@class="summary"]/h3')
        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            yield item
Example #24
    def parse_url(self, response):
        htmlFile = str(response.body)
        for link in SgmlLinkExtractor(allow="news," + self.code +
                                      "(.*?)html").extract_links(response):
            exist = self.redisUtils.isExist(link.url)
            if (not exist):
                self.redisUtils.saveUrl(link.url)

        while True:
            print 'get a detail job'
            url = self.redisUtils.getUrl()
            url = str(url)
            if (cmp(url, 'None')):
                url = str(url)
                if 'news' in url:  #the get the detail from the page
                    print 'get urls from the page:' + url
                    yield Request(url, callback=self.parse_detail)
            else:
                break
        return
Example #25
class RecursiveSpider(CrawlSpider):
    name = "nd"
    allowed_domains = ["elnuevodiario.com.ni"]
    start_urls = ['http://www.elnuevodiario.com.ni/sucesos/']

    rules = (Rule(SgmlLinkExtractor(allow=("/sucesos/*", )),
                  callback='parse_item',
                  follow=False), )

    def parse_item(self, response):
        sel = Selector(response)
        item = RecursiveItem()
        item['URL'] = response.request.url
        item['TITLE'] = sel.xpath(
            '/html/body/main/section/section/section/header/section/div[1]/h1/text()'
        ).extract()
        item['CONTENT'] = sel.xpath(
            '/html/body/main/section/section/section/section/div[2]/p/text()'
        ).extract()
        return item
Example #26
class BlogSpider(CrawlSpider):
    name = "weknowfuture"
    allowed_domains = ["weknowfuture.blogspot.in"]
    start_urls = ["http://weknowfuture.blogspot.in/"]
    rules = (
        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(SgmlLinkExtractor(allow=('\.html', )),
             callback='parse_item',
             follow=True), )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = BlogCrawlerItem()
        item['title'] = hxs.select(
            ".//*[@id='Blog1']/div[1]/div/div/div/div[1]/h3").re('>\n(.*?)\n<')
        item['link'] = hxs.select(
            ".//*[@id='Blog1']/div[1]/div/div/div/div[1]/div/div/a").re(
                '>(.*?)<')
        item['url'] = response.url
        return item
Example #27
class SubtitleSpider(CrawlSpider):
    name = "ss"
    allowed_domains = ['www.springfieldspringfield.co.uk', 'springfieldspringfield.co.uk']
    start_urls = [ss_base_url]
    rules = [Rule(SgmlLinkExtractor(allow=['/view_episode_scripts.php\?tv-show=the-simpsons&episode=\w+']), 'parse_script')]

    def fix_field_names(self, field_name):
        field_name = re.sub(" ","_", field_name)
        field_name = re.sub(":","", field_name)
        return field_name

    def parse_script(self, response):
        x = HtmlXPathSelector(response)

        script = Script()

        script['url'] = response.url
        script['episode_name'] = "".join(x.select("//h3/text()").extract())
        script['script'] = "\n".join(x.select("//div[@class='episode_script']/text()").extract())
        return script
Example #28
class GrSpider(CrawlSpider):
    name = "gr"
    allowed_domains = ["goodreads.com"]
    start_urls = ["https://www.goodreads.com/quotes/tag/love"]
    #recursively go through all pages with quotes
    rules = (Rule(SgmlLinkExtractor(allow=('/quotes/tag/love?page', )),
                  callback="parse",
                  follow=True), )

    def parse(self, response):
        item = GrItem()
        #doesn't pick up full quote for quote split in between br
        item['quotes'] = list()
        for res in response.xpath("//div[@class=\"quoteText\"]"):
            res = res.xpath('normalize-space(./text())').extract()[0].encode(
                'ascii', "replace")[1:-1]
            item['quotes'].append(res)
        item['link'] = response.xpath(
            '//a[@class=\"next_page\"]/@href').extract()
        yield item
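The comment above notes that quotes broken across <br> tags come out truncated, because normalize-space(./text()) only uses the first direct text node of the div. A minimal sketch, assuming the same page structure, that concatenates every descendant text node with the XPath string() function instead:

# Hypothetical variant: string(.) joins all text under the quote div,
# so pieces separated by <br> end up in one value.
for res in response.xpath('//div[@class="quoteText"]'):
    full_quote = res.xpath('normalize-space(string(.))').extract()[0]
    item['quotes'].append(full_quote)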
Example #29
class RecursiveSpider(CrawlSpider):
    name = 'recursive'
    allowed_domains = ['cse.iitd.ernet.in']
    start_urls = ['http://www.cse.iitd.ernet.in/~naveen/']

    rules = (Rule(
        SgmlLinkExtractor(allow=('cse\.iitd\.ernet\.in/\~naveen/.*', )),
        callback='parse_item',
        follow=True), )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        i = NewscrapyItem()
        i['URL'] = response.request.url
        i['content'] = hxs.select(
            '/html/body/table/tbody/tr[3]/td[1]/text()[1]').extract()
        #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract()
        #i['name'] = hxs.select('//div[@id="name"]').extract()
        #i['description'] = hxs.select('//div[@id="description"]').extract()
        return i
Example #30
class TrySpider(CrawlSpider):
    handle_httpstatus_list = [302]
    name = "try"  # Name of your spider
    # Add allowed domains, leave it blank to allow everything
    allowed_domains = ["Try.in"]
    start_urls = ['http://www.Try.in/']  # Has the start URL

    # allow add some regex or links to allow those
    # callback function for a url when allowed by the rul
    rules = (Rule(SgmlLinkExtractor(allow=(".*", ), unique=True),
                  callback='parse_item',
                  follow=True), )

    def parse_item(self, response):
        sel = Selector(response)
        items = []
        item = TryItem()
        item['Source_Website'] = "http://www.Try.in/"
        item['Title'] = sel.xpath('Your_x_path').extract()
        return item