Exemplo n.º 1
0
 def parse_question(self, response):
     """Build a single ``StackItem`` from a question page.

     ``title`` is the first text node matched by ``#question-header h1 a``,
     ``content`` is the first element matched by ``.question .post-text``
     (the serialized element, not only its text) and ``url`` is the URL of
     the response.

     Raises:
         IndexError: if either CSS query matches nothing.
     """
     title_matches = response.css("#question-header h1 a::text").extract()
     body_matches = response.css(".question .post-text").extract()

     entry = StackItem()
     entry["title"] = title_matches[0]
     entry["url"] = response.url
     entry["content"] = body_matches[0]
     yield entry
Exemplo n.º 2
0
    def parse(self, response):
        """Collect the text of every ``toanvancontent`` paragraph into one item.

        For each ``<p>`` directly under an element whose class attribute is
        ``toanvancontent``, the text found under ``<strong>``, then under
        ``<em>``, then in every descendant text node is stripped and appended
        to one string.  That string is stored in ``item['textContent']`` and
        exactly one item is yielded per response, even when no paragraph
        matches (the text is then empty).

        The final ``.//text()`` pass also matches the ``<strong>``/``<em>``
        text already collected, so that text appears more than once.
        """
        paragraphs = Selector(response).xpath('//*[@class="toanvancontent"]/p')
        print("Question Len = " , len(paragraphs))

        # Created once, before the loop.  The item used to be bound only inside
        # the loop body, so a page without any matching paragraph failed with
        # UnboundLocalError at the final assignment.
        item = StackItem()

        # Fragments are gathered in a list and joined once at the end instead
        # of growing a string with ``+=`` on every text node.
        pieces = []
        for paragraph in paragraphs:
            # Debug output: number of direct, <em> and <strong> text nodes.
            print(len(paragraph.xpath('./text()').extract()))
            print(len(paragraph.xpath('./em/text()').extract()))
            print(len(paragraph.xpath('./strong/text()').extract()))

            for query in ('.//strong/text()', './/em/text()', './/text()'):
                for sentence in paragraph.xpath(query).extract():
                    # NOTE(review): on Python 3, str() of the encoded bytes
                    # gives the "b'...'" repr -- confirm the target interpreter.
                    cleaned = str(sentence.encode('utf-8').strip())
                    print(cleaned)
                    pieces.append(cleaned)
            print("---------------------")

        item['textContent'] = "".join(pieces)
        yield item
Exemplo n.º 3
0
    def parse_product(self, response):
        """Extract one product ``StackItem`` from a product detail page.

        ``brand``, ``name``, ``price``, ``how_to_use``, ``ingredients`` and
        ``image_url`` each take the first match of their CSS query, stripped,
        or ``''`` when nothing matches.  ``details`` concatenates every text
        node matched under the product-details container.  ``product_url`` is
        the URL of the request that produced the response.
        """
        def extract_with_css(query):
            # First match only; a missing node yields '' instead of None.
            return response.css(query).get(default='').strip()

        product = StackItem()
        # Raw string: the backslash must reach the CSS parser to escape the
        # ``$`` in the class name.  In a normal literal ``\$`` is an invalid
        # Python escape sequence, which newer interpreters warn about; the
        # runtime value of the selector is unchanged.
        product['brand'] = extract_with_css(
            r'p.Text.Text--body-1.Text--left.Text--bold.Text--small.Text--\$magenta-50::text'
        )
        product['name'] = extract_with_css(
            'span.Text.Text--subtitle-1.Text--left.Text--small.Text--text-20::text'
        )
        product['price'] = extract_with_css(
            'span.Text.Text--title-6.Text--left.Text--bold.Text--small.Text--neutral-80::text'
        )
        # Unlike the other fields, every matched text node is joined here.
        product['details'] = "".join(
            response.css(
                'div.ProductDetail__ProductRow div#productDetails.ProductDetail__productDetails div.ProductDetail__productContent *::text'
            ).extract())
        product['how_to_use'] = extract_with_css(
            'div.ProductDetail__howToUse div.Collapsible div.Collapsible__contentOuter div.Collapsible__contentInner div.ProductDetail__productContent *::text'
        )
        product['ingredients'] = extract_with_css(
            'div.ProductDetail__ingredients div.Collapsible div.Collapsible__contentOuter div.Collapsible__contentInner div.ProductDetail__productContent *::text'
        )
        # First <img> anywhere in the document.
        product['image_url'] = extract_with_css('img::attr(src)')
        product['product_url'] = response.request.url

        yield product
Exemplo n.º 4
0
	def parse(self, response):
		"""Yield a ``StackItem`` (title, url) for each ``<h3>`` under a summary div.

		A summary div is a ``<div>`` whose class attribute is ``summary``.

		Raises:
			IndexError: if a heading has no ``a.question-hyperlink`` text or href.
		"""
		for heading in Selector(response).xpath('//div[@class="summary"]/h3'):
			link_texts = heading.xpath('a[@class="question-hyperlink"]/text()').extract()
			link_hrefs = heading.xpath('a[@class="question-hyperlink"]/@href').extract()

			entry = StackItem()
			entry['title'] = link_texts[0]
			entry['url'] = link_hrefs[0]
			yield entry
Exemplo n.º 5
0
 def parse_page(self, response):
     """Yield one ``StackItem`` per ``<p>`` that is a direct child of a ``<div>``.

     ``url`` is the URL of the response; ``desc`` is the list of the
     paragraph's direct text nodes, which may be empty.
     """
     for paragraph in Selector(response).xpath('//div/p'):
         entry = StackItem()
         entry['url'] = response.url
         text_nodes = paragraph.xpath('text()').extract()
         entry['desc'] = text_nodes
         yield entry
Exemplo n.º 6
0
    def parse(self, response):
        """Yield a ``StackItem`` (title, url) per link inside a ``p.title``.

        Matches ``<a>`` children of ``<p>`` elements whose class attribute is
        ``title``.

        Raises:
            IndexError: if a link has no direct text node or no ``href``.
        """
        for anchor in Selector(response).xpath('//p[@class="title"]/a'):
            label = anchor.xpath('text()').extract()[0]
            target = anchor.xpath('@href').extract()[0]

            entry = StackItem()
            entry['title'] = label
            entry['url'] = target
            yield entry
Exemplo n.º 7
0
 def parse(self, response):
     """Yield a ``StackItem`` for each (text, href) pair of summary links.

     Link texts and hrefs are collected in two separate CSS queries over
     ``div.summary >h3>a`` and paired positionally; pairing stops at the
     shorter of the two lists.
     """
     link_texts = response.css('div.summary >h3>a::text').getall()
     link_hrefs = response.css('div.summary >h3>a::attr(href)').getall()

     for link_text, link_href in zip(link_texts, link_hrefs):
         entry = StackItem()
         entry['title'] = link_text
         entry['url'] = link_href
         yield entry
  def parse(self, response):
    """Yield a ``StackItem`` (title, url) for each ``<h3>`` under a summary div.

    Raises:
      IndexError: if a heading has no ``a.question-hyperlink`` text or href.
    """
    headings = Selector(response).xpath("//div[@class='summary']/h3")

    for heading in headings:
      first_text = heading.xpath("a[@class='question-hyperlink']/text()").extract()[0]
      first_href = heading.xpath("a[@class='question-hyperlink']/@href").extract()[0]

      entry = StackItem()
      entry["title"] = first_text
      entry["url"] = first_href
      yield entry
Exemplo n.º 9
0
 def parse(self, response):
     """Yield a ``StackItem`` (title, url) per ``<td class="title">`` cell.

     Both values come from the cell's ``a.storylink`` child.

     Raises:
         IndexError: if a matched cell has no such link text or href.
     """
     for cell in Selector(response).xpath('//td[@class="title"]'):
         story_text = cell.xpath('a[@class="storylink"]/text()').extract()[0]
         story_href = cell.xpath('a[@class="storylink"]/@href').extract()[0]

         entry = StackItem()
         entry['title'] = story_text
         entry['url'] = story_href
         yield entry
Exemplo n.º 10
0
    def parse_item(self, response):
        """Yield a ``StackItem`` (name, url) per link directly under a list group.

        A list group is a ``<div>`` whose class attribute is ``list-group``.

        Raises:
            IndexError: if a link has no direct text node or no ``href``.
        """
        anchors = response.xpath('//div[@class="list-group"]/a')
        for anchor in anchors:
            label = anchor.xpath('text()').extract()[0]
            target = anchor.xpath('@href').extract()[0]

            entry = StackItem()
            entry['name'] = label
            entry['url'] = target
            yield entry
Exemplo n.º 11
0
 def parse(self, response):
     """Yield a ``StackItem`` (code, prerequisites) per ``div.courseblock``.

     ``code`` is the first text node of ``button/h3/span.code`` with every
     ``"\\u00a0"`` replaced by a plain space.  ``prerequisites`` is the list
     of texts of the ``a`` elements whose class attribute is
     ``bubblelink code``, and may be empty.

     Raises:
         IndexError: if a course block has no code span text.
     """
     for block in Selector(response).xpath('//div[@class="courseblock"]'):
         raw_code = block.xpath(
             'button/h3/span[@class="code"]/text()').extract()[0]
         prerequisite_codes = block.xpath(
             'div/div/div/p/a[@class="bubblelink code"]/text()').extract()

         entry = StackItem()
         entry['code'] = raw_code.replace("\u00a0", " ")
         entry['prerequisites'] = prerequisite_codes
         yield entry
Exemplo n.º 12
0
 def parse(self, response):
     """Yield a ``StackItem`` (domain, title, url) per search-result headline.

     ``domain`` is always the constant ``"bloomberg.com"``; title and url
     come from the ``<a>`` child of each matched headline ``<h1>``.

     Raises:
         IndexError: if a headline has no link text or href.
     """
     headlines = Selector(response).xpath(
         '//div[@class="search-result-story__container"]/h1[@class="search-result-story__headline"]'
     )
     for headline in headlines:
         link_text = headline.xpath('a/text()').extract()[0]
         link_href = headline.xpath('a/@href').extract()[0]

         entry = StackItem()
         entry['domain'] = "bloomberg.com"
         entry['title'] = link_text
         entry['url'] = link_href
         yield entry
Exemplo n.º 13
0
    def parse(self, response):
        """Yield a ``StackItem`` (url, title, question) per summary div.

        ``url`` and ``title`` are the first href / text of the
        ``h3/a.question-hyperlink`` link, or ``None`` when it is absent.
        ``question`` is always set to the empty string here.
        """
        for summary in Selector(response).xpath('//div[@class="summary"]'):
            href = summary.xpath(
                'h3/a[@class="question-hyperlink"]/@href').extract_first()
            heading = summary.xpath(
                'h3/a[@class="question-hyperlink"]/text()').extract_first()

            entry = StackItem()
            entry["url"] = href
            entry["title"] = heading
            entry["question"] = ""
            yield entry
Exemplo n.º 14
0
 def parse_node(self, response, node):
     """Map one feed node to a ``StackItem``.

     Every field is stored as the list returned by ``extract()`` for its
     XPath query, evaluated relative to ``node``.  Namespaces are removed
     from the response's selector before the node is read.
     """
     response.selector.remove_namespaces()

     # NOTE(review): ``dc:creator`` and ``content:encoded`` are prefixed
     # names; whether they resolve depends on how ``node`` was built --
     # confirm against the spider's iterator / namespace configuration.
     field_queries = (
         ('title', 'normalize-space(title/text())'),
         ('url', 'link/text()'),
         ('authorname', 'dc:creator/text()'),
         ('publicationdate', 'pubdate/text()'),
         ('desc', 'description/text()'),
         ('content', 'content:encoded/text()'),
     )

     item = StackItem()
     for field_name, query in field_queries:
         item[field_name] = node.xpath(query).extract()
     yield item
Exemplo n.º 15
0
    def parse(self, response):
        """Print the title and link of every question heading on the page.

        Looks at each ``<h3>`` under a ``<div>`` whose class attribute is
        ``summary`` and prints the first text and href of its
        ``a.question-hyperlink`` child.  Nothing is returned or yielded.

        Raises:
            IndexError: if a heading has no such link text or href.
        """
        # The class name was spelled "summury", which looks like a typo for
        # "summary" and would match nothing.
        questions = HtmlXPathSelector(response).select(
            '//div[@class="summary"]/h3')

        for question in questions:
            title = question.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            # Was ``questionxpath(...)``: a missing dot that raised NameError.
            url = question.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            # Single-argument print works as a statement and as a function;
            # the layout matches the former ``print title, "\n", url, "\n"``.
            print("%s \n%s \n" % (title, url))
Exemplo n.º 16
0
    def parse(self, response):
        """Yield an item per linked ``<h4>`` heading, then follow every link.

        For each ``<h4>`` directly under a ``<div>`` that has an ``<a>`` child
        with an ``href``, a ``StackItem`` is yielded whose ``title`` is the
        list of the link's text nodes and whose ``url`` is the href prefixed
        with ``http://reason.org``.  Afterwards a ``Request`` handled by
        ``self.parse_page`` is yielded for every link the extractor finds in
        the response.
        """
        sites = Selector(response).xpath('//div/h4')
        for site in sites:
            # Debug output; the parenthesised form is valid on Python 2 and 3.
            print(site)

            href = site.xpath("a/@href").extract_first()
            if href is None:
                # A heading without a link used to produce the bogus URL
                # 'http://reason.orgNone'; such headings are skipped now.
                continue

            item = StackItem()
            item['title'] = site.xpath("a/text()").extract()
            item['url'] = 'http://reason.org' + str(href)
            #item['description'] = site.xpath("following-sibling::div/text()").extract_first('').strip()
            yield item

        for link in LxmlLinkExtractor(allow=()).extract_links(response):
            yield Request(link.url, callback=self.parse_page)
Exemplo n.º 17
0
    def parse_node(self, response, node):
        """Map one feed node to a ``StackItem``.

        ``title``, ``url``, ``authorname``, ``publicationdate`` and
        ``content`` each hold the list returned by ``extract()`` for the
        matching child element's text; ``content`` is read from the
        ``description`` element.  ``desc`` is always the empty string.
        """
        titles = node.xpath('title/text()').extract()
        links = node.xpath('link/text()').extract()
        authors = node.xpath('author/text()').extract()
        published = node.xpath('pubDate/text()').extract()
        descriptions = node.xpath('description/text()').extract()

        entry = StackItem()
        entry['title'] = titles
        entry['url'] = links
        entry['authorname'] = authors
        entry['publicationdate'] = published
        entry['desc'] = ''
        entry['content'] = descriptions

        yield entry
Exemplo n.º 18
0
    def parse(self, response):
        """Yield a ``StackItem`` (url, title, price) per product container.

        ``url`` and ``title`` are the ``href`` and ``title`` attributes of
        the product image link; ``price`` is the first text node of the
        price span.

        Raises:
            IndexError: if any of the three values is missing.
        """
        containers = Selector(response).xpath(
            '//div[@class="product-container text-left product-block"]')

        for container in containers:
            link_href = container.xpath(
                'div[@class="product-image-container image"]/a[@class="product_img_link"]/@href').extract()[0]
            link_title = container.xpath(
                'div[@class="product-image-container image"]/a[@class="product_img_link"]/@title').extract()[0]
            price_text = container.xpath(
                'div[@class="product-meta"]/div[@class="clearfix"]/div[@class="content_price"]/span[@class="price product-price "]/text()').extract()[0]

            entry = StackItem()
            entry['url'] = link_href
            entry['title'] = link_title
            entry['price'] = price_text

            yield entry
Exemplo n.º 19
0
 def parse_items(self, response):
     """Build one listing ``StackItem`` tagged with company ``'tuCasa'``.

     Most fields are read through ``self.format_xpath`` (defined outside
     this block; its result is treated as a string) using absolute XPath
     locations.  ``price``, ``rooms`` and ``surface`` keep the first
     whitespace-separated token of their text and ``update_date`` keeps
     the third.  ``description`` joins the matched text nodes and strips
     the result.

     Raises:
         IndexError: if a tokenised field has too few tokens.
     """
     def read(xpath_query):
         # Shorthand for the helper call repeated for every field.
         return self.format_xpath(response, xpath_query)

     item = StackItem()
     item["company"] = 'tuCasa'
     item["url"] = response.url
     item["title"] = read('/html/body/div[5]/h1/text()')

     price_tokens = read('/html/body/div[5]/div[1]/span[1]/text()').split()
     item["price"] = price_tokens[0]

     updated_tokens = read('/html/body/div[6]/div/div[3]/span[2]/text()').split()
     item["update_date"] = updated_tokens[2]

     room_tokens = read('/html/body/div[6]/div/div[1]/ul/li[4]/text()').split()
     item["rooms"] = room_tokens[0]

     surface_tokens = read('/html/body/div[6]/div/div[1]/ul/li[2]/span/text()').split()
     item["surface"] = surface_tokens[0]

     item["location"] = read('/html/body/div[5]/span/text()')

     description_parts = response.xpath('/html/body/div[6]/div/div[1]/text()').extract()
     item["description"] = ''.join(description_parts).strip()

     yield item
Exemplo n.º 20
0
    def parse_item(self, response):
        """Yield a ``StackItem`` (url, title) for each ``<h3>`` under a summary div.

        Raises:
            IndexError: if a heading has no ``a.question-hyperlink`` href or text.
        """
        for heading in response.xpath('//div[@class="summary"]/h3'):
            link_href = heading.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]
            link_text = heading.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]

            entry = StackItem()
            entry['url'] = link_href
            entry['title'] = link_text
            yield entry
Exemplo n.º 21
0
    def parse_items(self, response):
        """Yield a ``StackItem`` carrying only the company tag and the page URL.

        ``company`` is the constant ``'idealista'``.  No other field is
        populated: the remaining extractions are commented out below.
        """
        page_url = response.url

        listing = StackItem()
        listing["company"] = 'idealista'
        listing["url"] = page_url

        # Disabled field extractions, kept for reference:
        #item["title"]       = self.format_xpath(response, '//*[@id="dvTitulo"]/div[1]/h1/text()')
        #item["price"]       = self.format_xpath(response, '//*[@id="dvTitulo"]/div[2]/div[1]/div/div/span/text()').split()[0]
        #item["update_date"] = self.format_xpath(response, '//*[@class="gray-light size12"]/text()').split()[2]
        #item["rooms"]       = self.format_xpath(response, '//*[@id="dvTitulo"]/div[1]/div[1]/div[2]/text()').split()[0]
        #item["surface"]     = self.format_xpath(response, '//*[@id="dvTitulo"]/div[1]/div[1]/div[1]/text()').split()[0]
        #item["location"]    = self.format_xpath(response, '/html/body/div[5]/span/text()')
        #item["description"] = ''.join(response.xpath('//*[@class="description"]/text()').extract()).strip()

        yield listing
Exemplo n.º 22
0
    def parse(self, response):
        """Yield a ``StackItem`` with question and author details per summary div.

        ``title`` / ``url`` come from the ``h3/a.question-hyperlink`` link;
        ``username`` / ``useraddress`` are the text and href of the first link
        found under any descendant ``div.user-details``.

        Raises:
            IndexError: if any of the four values is missing from a summary.
        """
        # Indentation is spaces only.  The body used to mix tabs and spaces,
        # which Python 3 rejects with TabError.
        questions = Selector(response).xpath('//div[@class="summary"]')

        for question in questions:
            item = StackItem()
            item['title'] = question.xpath(
                'h3/a[@class="question-hyperlink"]/text()').extract()[0]
            item['url'] = question.xpath(
                'h3/a[@class="question-hyperlink"]/@href').extract()[0]
            item['username'] = question.xpath(
                './/div[@class="user-details"]/a/text()').extract()[0]
            item['useraddress'] = question.xpath(
                './/div[@class="user-details"]/a/@href').extract()[0]
            yield item
Exemplo n.º 23
0
    def parse(self, response):
        """Yield an item per ``div.post-title``, then follow every page link.

        ``title`` is the list of the link's text nodes (possibly empty) and
        ``url`` is the link's first href.  Afterwards a ``Request`` handled
        by ``self.parse_page`` is yielded for every link the extractor finds.

        Raises:
            IndexError: if a post title has no link with an ``href``.
        """
        for heading in Selector(response).xpath('//div[@class="post-title"]'):
            title_texts = heading.xpath('a/text()').extract()
            first_href = heading.xpath('a/@href').extract()[0]

            entry = StackItem()
            entry['title'] = title_texts
            entry['url'] = first_href
            yield entry

        extractor = LxmlLinkExtractor(allow=())
        for found in extractor.extract_links(response):
            yield Request(found.url, callback=self.parse_page)
Exemplo n.º 24
0
    def parse(self, response):
        """Yield question items from a listing page and follow the next page.

        Each ``<h3>`` under a summary div gives a ``StackItem`` whose ``url``
        is the link href prefixed with ``http://stackoverflow.com``.  When
        the page has an ``<a rel="next">``, a request for its href (same
        prefix) is yielded with this method as the callback.

        Raises:
            IndexError: if a heading lacks the link text or href, or if the
                next link exists but has no ``href``.
        """
        for heading in response.xpath('//div[@class="summary"]/h3'):
            link_text = heading.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            link_href = heading.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]

            entry = StackItem()
            entry['title'] = link_text
            entry['url'] = "http://stackoverflow.com" + link_href
            yield entry

        # Stop here on the last page: nothing to follow.
        if not response.xpath("//a[@rel='next']"):
            return

        next_href = response.xpath("//a[@rel='next']/@href").extract()[0]
        yield scrapy.Request("http://stackoverflow.com" + next_href, callback=self.parse)
Exemplo n.º 25
0
    def parse(self, response):
        """Yield a follow-up request per ``a.list-item-link``, carrying its item.

        For each link a ``StackItem`` (url, title, desc) is built from the
        link's href and its ``info-wrap`` children, then a request for that
        href is yielded with ``self.parse_content`` as callback and the item
        stored under ``meta['item']``.

        Raises:
            IndexError: if the href, title text or description text is missing.
        """
        for anchor in Selector(response).xpath('//a[@class="list-item-link"]'):
            target = anchor.xpath('@href').extract()[0]
            heading = anchor.xpath(
                'div[@class="info-wrap"]/p[@class="list-title"]/text()'
            ).extract()[0]
            summary = anchor.xpath(
                'div[@class="info-wrap"]/div[@class="list-desc"]/text()'
            ).extract()[0]

            entry = StackItem()
            entry['url'] = target
            entry['title'] = heading
            entry['desc'] = summary

            follow_up = scrapy.Request(target, callback=self.parse_content)
            follow_up.meta['item'] = entry
            yield follow_up
Exemplo n.º 26
0
 def parse(self, response):
     """Build one ``StackItem`` (url, title, question) from a question page.

     ``url`` and ``title`` are the first href / text of the question header
     link, or ``None`` when absent.  ``question`` is made from the ``<p>``
     elements of the post body: each is serialized, they are joined with
     single spaces, and the result is passed through ``tag.sub("", ...)``
     (``tag`` is defined outside this block -- presumably a compiled
     pattern that strips markup; confirm).
     """
     content = Selector(response).xpath('//div[@id="content"]')
     header_href = content.xpath(
         '//div[@id="question-header"]/h1/a[@class="question-hyperlink"]/@href'
     ).extract_first()
     header_text = content.xpath(
         '//div[@id="question-header"]/h1/a[@class="question-hyperlink"]/text()'
     ).extract_first()

     paragraphs = response.xpath(
         '//div[@class="postcell post-layout--right"]/div[@class="post-text"]/p'
     )
     joined_markup = " ".join([paragraph.extract() for paragraph in paragraphs])

     entry = StackItem()
     entry["url"] = header_href
     entry["title"] = header_text
     entry["question"] = tag.sub("", joined_markup)
     yield entry
Exemplo n.º 27
0
    def parse(self, response):
        """Yield a ``StackItem`` per ``<article>`` under ``main.site-main``.

        Every field holds the list returned by ``extract()`` for its XPath
        query, evaluated relative to the article, so any of them may be
        empty.
        """
        field_queries = (
            ('title', 'header/h2/a/text()'),
            ('url', 'header/h2/a/@href'),
            ('authorname', 'footer/span/span/a[@class="url fn n"]/text()'),
            ('authorurl', 'footer/span/span/a[@class="url fn n"]/@href'),
            ('publicationdate', 'footer/span/a/time[@class="entry-date published updated"]/text()'),
            ('content', 'div[@class="entry-content"]/p/text()'),
        )

        for article in Selector(response).xpath('//main[@class="site-main"]/article'):
            entry = StackItem()
            for field_name, query in field_queries:
                entry[field_name] = article.xpath(query).extract()

            yield entry
Exemplo n.º 28
0
    def parse(self, response):
        """Yield question items, then follow the link titled 'go to page 2'.

        Each ``<h3>`` under a summary div gives a ``StackItem`` (title, url).
        If a link with title ``go to page 2`` has an href, it is resolved
        against the response URL, printed, and requested with this method as
        the callback.

        Raises:
            IndexError: if a heading has no ``a.question-hyperlink`` text or href.
        """
        for heading in Selector(response).xpath('//div[@class="summary"]/h3'):
            link_text = heading.xpath(
                'a[@class="question-hyperlink"]/text()').extract()[0]
            link_href = heading.xpath(
                'a[@class="question-hyperlink"]/@href').extract()[0]

            entry = StackItem()
            entry['title'] = link_text
            entry['url'] = link_href
            yield entry

        next_links = response.xpath("//a[@title='go to page 2']/@href")
        if not next_links:
            return

        target = response.urljoin(next_links.extract_first())
        print("Found url: {}".format(target))
        yield scrapy.Request(target, callback=self.parse)
    def parse(self, response):
        """Yield a detail-page request per listed product, then the next page.

        For every ``li.item`` a ``StackItem`` is filled with the SKU list,
        product name, price and image URL, and a request for the product's
        ``a.product-image`` link is yielded with ``self.parse_item`` as
        callback and the item under ``meta['stackitem']``.  Products without
        such a link are skipped.  If a link with class ``next`` exists, its
        href is requested with the default callback.
        """
        for product in response.css("li.item"):
            # A fresh item per product.  A single instance used to be shared
            # by every request's meta and overwritten on each iteration, so
            # callbacks that ran later all saw the last product's data.
            stackitem = StackItem()
            stackitem['sku'] = product.xpath("@data-item-sku").extract()
            stackitem['productName'] = product.xpath(
                ".//h2/a/text()").extract_first()
            stackitem['price'] = product.xpath(
                './/span[@class="price"]/text()').extract_first()
            stackitem['imageUrl'] = product.xpath('.//img/@src').extract_first()

            detail_href = product.xpath(
                './/a[@class="product-image"]/@href').extract_first()
            if detail_href is None:
                # No detail link: building a Request from None would raise
                # and abort the rest of the page.
                continue

            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(detail_href),
                                 callback=self.parse_item,
                                 meta={'stackitem': stackitem})

        next_page_url = response.xpath(
            ".//a[@class='next']/@href").extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))
Exemplo n.º 30
0
    def parse_items(self, response):
        """Build one listing ``StackItem`` tagged with company ``'habitaclia'``.

        ``title``, ``price`` and ``detailReference`` are read through
        ``self.format_xpath`` (defined outside this block; its result is
        treated as a string).  ``price`` keeps only the first
        whitespace-separated token.  ``rooms`` and ``surface`` are set to
        empty strings; their extraction, like that of the remaining fields,
        is commented out below.

        Raises:
            IndexError: if the price text contains no token.
        """
        def read(xpath_query):
            # Shorthand for the helper call repeated for every field.
            return self.format_xpath(response, xpath_query)

        listing = StackItem()
        listing["company"] = 'habitaclia'
        listing["url"] = response.url
        listing["title"] = read('//*[@class="h1ficha"]/text()')

        price_tokens = read('//*[@class="precio-ficha"]/span/text()').split()
        listing["price"] = price_tokens[0]

        listing["detailReference"] = read(
            '//*[@class="referencia-ficha"]/span/text()')
        listing["rooms"] = ""
        listing["surface"] = ""

        # Disabled field extractions, kept for reference:
        #item["update_date"] = self.format_xpath(response, '//*[@id="contents_n"]/div[4]/div/div[1]/span/text()').split()[4].replace("(","").replace(")","")
        #item["rooms"]       = self.format_xpath(response, '//*[@class="detail-rooms"]/text()').split()[0]
        #item["surface"]     = self.format_xpath(response, '//*[@class="detail-m2"]/text()')[:-1]
        #item["location"]    = self.format_xpath(response, '/html/body/div[5]/span/text()')
        #item["description"] = ''.join(response.xpath('//*[@id="description"]/text()').extract()).strip()

        yield listing