Example #1
 def parse(self, response):
     for sel in response.xpath('//ul/li'):
         item = HelloItem()
         item['title'] = sel.xpath('a/text()').extract()
         item['link'] = sel.xpath('a/@href').extract()
         item['desc'] = sel.xpath('text()').extract()
         yield item
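These snippets all assume a HelloItem class defined in the project's items.py, which is not shown on this page. A minimal sketch covering just the fields used in Example #1 (title, link, desc) might look like the following; real projects typically declare more fields (see the commented-out fields in Example #3).

 import scrapy

 class HelloItem(scrapy.Item):
     # fields referenced by the parse() callback above
     title = scrapy.Field()
     link = scrapy.Field()
     desc = scrapy.Field()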
Example #2
 def parse(self, response):
     for quote in response.css('div.quote'):
         l = MainLoader(item=HelloItem(), selector=quote, response=response)
         l.add_css('text', 'span.text::text')
         l.add_css('author', '.quote small.author::text')
         l.add_css('tags', 'div.tags a.tag::text')
         yield l.load_item()
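MainLoader is not defined on this page; it is presumably an ItemLoader subclass that post-processes the scraped strings. A plausible sketch, assuming the usual strip-and-take-first setup (the processors chosen here are an assumption, not the original project's code):

 from itemloaders.processors import Identity, MapCompose, TakeFirst
 from scrapy.loader import ItemLoader

 class MainLoader(ItemLoader):
     # strip whitespace on input, keep only the first match on output
     default_input_processor = MapCompose(str.strip)
     default_output_processor = TakeFirst()
     # keep every tag rather than just the first one
     tags_out = Identity()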
Example #3
File: spider.py Project: neverqaz/code
 def parse(self, response):
     sel = scrapy.Selector(response)
     print(response.body)
     for h3 in sel.xpath('//h3').extract():
         for a in sel.xpath('//h3/a/@href').extract():
             a = self.url + a
             yield scrapy.Request(a)  # meta={'dont_redirect':True}
             # title = scrapy.Field()
             # type1 = scrapy.Field()
             # actor = scrapy.Field()
             # languge = scrapy.Field()
             # time = scrapy.Field()
             # link = scrapy.Field()
             # desc = scrapy.Field()
             # img = scrapy.Field()
             # last_updated = scrapy.Field(serializer=str)
             h1 = sel.xpath('//h1/text()').extract()
             type = sel.xpath(
                 '//span[@class="span_block"]/a/text()').extract()
             actor = sel.xpath(
                 '//*[@id="minfo"]/div[2]/span[4]/span/a/text()').extract()
             desc = sel.xpath(
                 '//*[@id="movie_content"]/text()[1]').extract()
             link = sel.xpath(
                 '//*[@id="myform"]/ul/li[2]/span[2]/a/@href').extract()
             #languge=sel.xpath('//')
             yield HelloItem(title=h1,
                             type1=type,
                             actor=actor,
                             desc=desc,
                             link=link)
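Example #3 yields the follow-up Request and extracts the detail fields in the same loop, so the detail XPaths actually run against the listing page and the Request never gets a callback of its own. The conventional split is one callback per page type; a sketch under that assumption (parse_detail is a name introduced here, and self.url comes from the original spider):

 def parse(self, response):
     # listing page: follow every movie link to its detail page
     for href in response.xpath('//h3/a/@href').extract():
         yield scrapy.Request(self.url + href, callback=self.parse_detail)

 def parse_detail(self, response):
     # detail page: the field XPaths from Example #3 belong here
     yield HelloItem(
         title=response.xpath('//h1/text()').extract(),
         type1=response.xpath('//span[@class="span_block"]/a/text()').extract(),
         actor=response.xpath('//*[@id="minfo"]/div[2]/span[4]/span/a/text()').extract(),
         desc=response.xpath('//*[@id="movie_content"]/text()[1]').extract(),
         link=response.xpath('//*[@id="myform"]/ul/li[2]/span[2]/a/@href').extract())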
Example #4
 def parse_item(self, response):
     for sel in response.css('div.post-desc'):
         subsel = sel.css('div.post-title h2.entry-title a')
         item = HelloItem()
         item['title'] = subsel.xpath('text()').extract()
         item['text'] = sel.css('div.post-excerpt::text').extract()
         item['link'] = subsel.xpath('@href').extract()
         yield item
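A parse_item callback like this is usually wired up through CrawlSpider rules rather than called directly. One way the surrounding spider might look (the class name, start URL, and link pattern below are assumptions for illustration):

 from scrapy.spiders import CrawlSpider, Rule
 from scrapy.linkextractors import LinkExtractor

 class HelloSpider(CrawlSpider):
     name = 'hello'
     start_urls = ['http://example.com/blog/']   # placeholder URL
     rules = (
         # follow pagination links and pass every page to parse_item
         Rule(LinkExtractor(restrict_css='.pager'), callback='parse_item', follow=True),
     )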
Example #5
    def parse(self, response):
        # title=response.xpath("/html/head/title").extract()

        for sel in response.xpath('//a'):
            item = HelloItem()
            item['title'] = sel.xpath('text()').extract()
            item['link'] = sel.xpath('@href').extract()
            # item['desc'] = sel.xpath('text()').extract()
            yield item
Example #6
    def parse_item(self, response):
        self.browser = webdriver.PhantomJS()
        self.browser.implicitly_wait(10)
        self.browser.get(response.url)
        item = HelloItem()
        item['app_name'] = response.xpath(
            '//div[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[1]/div[1]/text()'
        ).extract()[0]
        item['score'] = response.xpath(
            '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[2]/div[2]/text()'
        ).extract()[0]
        item['detail'] = response.xpath('//*[@id="J_DetAppDataInfo"]/div[1]'
                                        ).xpath('string(.)').extract()[0]
        item['down_num'] = response.xpath(
            '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[3]/div[1]/text()'
        ).extract()[0]
        item['category'] = response.xpath(
            '//*[@id="J_DetCate"]/text()').extract()[0]
        item['com_num'] = self.browser.find_element_by_xpath(
            '//*[@id="J_CommentCount"]').text
        #com_list=self.browser.find_element_by_id("J_DetShowCommentList")
        try:
            for i in range(0, 100):
                self.browser.find_element_by_id(
                    "J_DetCommentShowMoreBtn").click()
        except Exception:
            print("error")

        item['com_name'] = self.browser.find_element_by_id(
            "J_DetShowCommentList").text
        #item['com_time']=self.browser.find_element_by_xpath('//*[@id="J_DetShowCommentList"]/li[2]/div[1]/div[1]/div[3]').text
        #item['com_text']=self.browser.find_elements_by_xpath('//*[@id="J_DetShowCommentList"]/li[2]/div[1]/div[2]').text

        #print "got name " + self.browser.find_element_by_class_name("comment-name").text
        #for com_name in com_names:
        #   print "get name "+com_name.text
        #for com_text in com_texts:
        #   print "get text "+com_text.text
        #print "got date "+ self.browser.find_element_by_class_name("comment-date").text
        #print "got title " +
        #print "got text " + self.browser.find_element_by_class_name("comment-datatext").text
        #print "got details "+ item['detail']
        #print "got category "+ item['category']
        yield item

        #for com_site in com_sites:
        #   print com_site,xpath('div[1]/div[1]/div[1]/text()').extract()[0]

    def __del__(self):
        self.browser.close()
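Opening a fresh PhantomJS instance inside every parse_item call is slow, and relying on __del__ to close it is fragile. A more typical arrangement is to create the browser once and release it in the spider's closed() hook; a sketch only (the spider name and details are placeholders, and PhantomJS is deprecated in current Selenium releases, where a headless Chrome or Firefox driver is the usual substitute):

 import scrapy
 from selenium import webdriver

 class AppSpider(scrapy.Spider):
     name = 'app'          # placeholder name

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # one shared browser for the whole crawl
         self.browser = webdriver.PhantomJS()
         self.browser.implicitly_wait(10)

     def closed(self, reason):
         # called once when the spider finishes
         self.browser.quit()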
Example #7
    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = HelloItem()
            text = quote.css('.text::text').extract_first()  # first matching text node
            author = quote.css(
                '.author::text').extract_first()  # extract_first(): only the first match
            tags = quote.css('.tags .tag::text').extract()  # extract(): all matches
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page is not None:  # stop when there is no next page
            url = response.urljoin(next_page)  # build the absolute URL
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)
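Since Scrapy 1.4 the same pagination step can be written with response.follow, which accepts relative URLs directly (this sketch yields a plain dict instead of HelloItem purely for brevity):

    def parse(self, response):
        for quote in response.css('.quote'):
            yield {
                'text': quote.css('.text::text').extract_first(),
                'author': quote.css('.author::text').extract_first(),
                'tags': quote.css('.tags .tag::text').extract(),
            }
        # response.follow() resolves relative URLs, so urljoin() is unnecessary
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)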
Example #8
File: hello_il.py Project: amozie/amozie
 def parse(self, response):
     l = ItemLoader(item=HelloItem(), response=response)
     l.add_css('text', 'h2.entry-title a::text')
     l.add_css('text', 'h2.entry-title a::attr(href)')
     return l.load_item()
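Both add_css() calls above feed the same 'text' field, so load_item() collects the anchor text and the href into one list under that key. If the item class also defines a separate link field (an assumption; the item is not shown here), the second call would more commonly target it:

 def parse(self, response):
     l = ItemLoader(item=HelloItem(), response=response)
     l.add_css('text', 'h2.entry-title a::text')
     l.add_css('link', 'h2.entry-title a::attr(href)')  # assumes HelloItem declares 'link'
     return l.load_item()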