def scrape_home(self, response):
    """Scrape every quote on the page, then follow pagination.

    Yields one loaded ``QuotesSpiderItem`` per quote and, when a
    "next" link exists, a ``Request`` for the following page.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        # A fresh loader per quote is required: the original built one
        # loader outside the loop, which accumulated every quote's
        # values into a single item.
        loader = ItemLoader(item=QuotesSpiderItem(), selector=quote)
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath(
            './/*[@itemprop="author"]/text()').extract_first()
        tags = quote.xpath(
            './/*[@itemprop="keywords"]/@content').extract_first()
        loader.add_value('text', text)
        loader.add_value('author', author)
        loader.add_value('tags', tags)
        yield loader.load_item()

    next_page_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    # On the last page there is no "next" link; urljoin(None) / Request
    # on a bogus URL would raise, so guard explicitly.
    if next_page_url is not None:
        yield Request(response.urljoin(next_page_url))
def parse(self, response):
    """Yield one item per quote on the page and follow the next page.

    Each quote's text, author, and comma-joined keyword string are
    loaded into a ``QuotesSpiderItem``.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        # Build the loader inside the loop: a single shared loader (as
        # the original had) merges all quotes into one oversized item.
        l = ItemLoader(item=QuotesSpiderItem(), selector=quote)
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath(
            './/*[@itemprop="author"]/text()').extract_first()
        tags = quote.xpath(
            './/*[@itemprop="keywords"]/@content').extract_first()
        l.add_value('author', author)
        l.add_value('tags', tags)
        l.add_value('text', text)
        yield l.load_item()

    next_page_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    # No "next" link on the final page — only schedule a request when
    # one was actually found.
    if next_page_url is not None:
        yield scrapy.Request(response.urljoin(next_page_url))
def parse(self, response):
    """Yield a loaded item for every quote, then crawl the next page.

    Fixes over the previous revision: the loader is created per quote
    (a shared loader accumulated all quotes into one item), each item
    is yielded inside the loop (``return l.load_item()`` in a generator
    silently discarded the item), and the next-page request is guarded
    against the final page.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        l = ItemLoader(item=QuotesSpiderItem(), selector=quote)
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath(
            './/*[@class="author"]/text()').extract_first()
        tags = quote.xpath('.//*[@class="tag"]/text()').extract()
        l.add_value('text', text)
        l.add_value('author', author)
        l.add_value('tags', tags)
        yield l.load_item()

    next_page_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    if next_page_url is not None:
        yield scrapy.Request(response.urljoin(next_page_url))
def parse(self, response):
    """Load the page heading and the tag-cloud entries into one item."""
    item_loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    h1_tag = response.xpath('//h1/a/text()').extract_first()
    # Class name corrected from "tags-item" to "tag-item" to match the
    # markup used by every sibling callback in this project; the old
    # selector matched nothing and always produced an empty list.
    tags = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    item_loader.add_value('h1_tag', h1_tag)
    item_loader.add_value('tags', tags)
    return item_loader.load_item()
def scrape_home_page(self, response):
    """Collect the <h1> link text and the tag-cloud labels into one item."""
    # open_in_browser(response)  # uncomment for visual debugging
    loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    heading = response.xpath('//h1/a/text()').extract_first()
    tag_labels = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    loader.add_value('h1_tag', heading)
    loader.add_value('tags', tag_labels)
    return loader.load_item()
def parse(self, response):
    """Load the first page heading and the first hyperlink into one item."""
    loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    loader.add_value(
        'heading', response.xpath("//h1/text()").extract_first())
    loader.add_value(
        'link', response.xpath("//a/@href").extract_first())
    return loader.load_item()
def parse(self, response):
    """Emit an item per quote, then recurse into the next page if any."""
    for quote_sel in response.css('div.quote'):
        text = quote_sel.css('span.text::text').extract_first()
        author = quote_sel.xpath('span/small/text()').extract_first()
        item = QuotesSpiderItem()
        item['text'] = text
        item['author'] = author
        yield item

    # Follow pagination; the "next" link is absent on the last page.
    href = response.css('li.next a::attr("href")').extract_first()
    if href is not None:
        yield response.follow(href, self.parse)
def scrape_home_page(self, response):
    """Scrape the homepage heading and tag cloud into a single item.

    NOTE: an earlier per-quote ``parse`` implementation (quote text /
    author / keywords plus pagination) was retired and kept here only
    as commented history; this callback now collects just the <h1>
    link text and the tag-cloud entries.
    """
    # for debugging only
    open_in_browser(response)
    loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    heading = response.xpath('//h1/a/text()').extract_first()
    tag_cloud = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    loader.add_value('h1_tag', heading)
    loader.add_value('tags', tag_cloud)
    return loader.load_item()
def parse(self, response):
    """Return one item holding the page title and the sidebar tags."""
    loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    # Page title (the <h1> anchor text).
    page_title = response.xpath('//h1/a/text()').extract_first()
    # The ten "Top Ten tags" entries on the left side of the page.
    sidebar_tags = response.xpath(
        '//*[@class="tag-item"]/a/text()').extract()
    loader.add_value('h1_tag', page_title)
    loader.add_value('tags', sidebar_tags)
    return loader.load_item()
def parse(self, response):
    """Load the page heading and tag-cloud entries into one item.

    Bug fix: the loader was constructed with the misspelled keyword
    ``reponser=response``, which raises ``TypeError: unexpected keyword
    argument`` on every call; the correct keyword is ``response``.
    """
    l = ItemLoader(item=QuotesSpiderItem(), response=response)
    h1_tag = response.xpath('.//h1/a/text()').extract_first()
    tags = response.xpath('.//*[@class="tag-item"]/a/text()').extract()
    l.add_value('h1_tag', h1_tag)
    l.add_value('tags', tags)
    return l.load_item()
def home_page_advanced(self, response):
    """Scrape the homepage with help of pipelines.py & items.py."""
    # open_in_browser(response)  # debugging aid
    loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    title = response.xpath('//h1/a/text()').extract_first()
    tag_items = response.xpath('//*[@class="tag-item"]/a/text()').extract()
    loader.add_value('h1_tag', title)
    loader.add_value('tags', tag_items)
    return loader.load_item()
def parse(self, response):
    """Yield one loaded item for each quote on the page.

    Fixes: the loader is now created per quote (the shared loader
    outside the loop merged every quote into a single item), and a
    stray trailing ``\"\"\"`` that opened an unterminated string literal
    has been removed.
    """
    quotes = response.xpath("//div[@class='quote']")
    for quote in quotes:
        l = ItemLoader(item=QuotesSpiderItem(), selector=quote)
        text = quote.xpath(".//*[@class='text']/text()").extract_first()
        author = quote.xpath(
            ".//*[@itemprop='author']/text()").extract_first()
        tags = quote.xpath(".//*[@itemprop='keywords']/@content").extract()
        l.add_value('text', text)
        l.add_value('author', author)
        l.add_value('tags', tags)
        yield l.load_item()
def parse(self, response):
    """Yield a plain dict per quote and follow pagination.

    Cleanup: the previous revision constructed an ``ItemLoader`` that
    was never used (all ``add_value``/``load_item`` calls were
    commented out) — the dead loader and commented-out experiments are
    removed. A guard is added so no request is scheduled on the last
    page, where the "next" link is absent.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        # Author name as a plain string.
        author = quote.xpath(
            './/*[@itemprop="author"]/text()').extract_first()
        # Keywords come back as one comma-separated string from the
        # @content attribute (vs. a list from the .tag elements).
        tags = quote.xpath(
            './/*[@itemprop="keywords"]/@content').extract_first()
        yield {'Text': text, 'Author': author, 'Tags': tags}

    next_page_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    if next_page_url is not None:
        yield scrapy.Request(response.urljoin(next_page_url))
def scrape_home_page(self, response):
    """Load the homepage heading and tag cloud into a single item.

    This callback does not need to be named ``parse``; it is wired up
    explicitly as a callback.
    """
    open_in_browser(response)  # opens the fetched page in a browser (debug)
    loader = ItemLoader(item=QuotesSpiderItem(), response=response)
    heading = response.xpath('//h1/a/text()').extract_first()
    tag_cloud = response.xpath('//*[@class = "tag-item"]/a/text()').extract()
    loader.add_value('h1_tag', heading)
    loader.add_value('tags', tag_cloud)
    # Per-quote extraction (text / author / keywords) was prototyped
    # here previously but is intentionally left out of this callback.
    return loader.load_item()
def parse(self, response):
    """Yield one loaded item per quote, then follow the next page.

    Robustness fix: only schedule the pagination request when a "next"
    link exists — on the last page ``extract_first()`` returns None and
    the unconditional ``Request`` call crashed the crawl.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        # Item loader from items.py, scoped to this quote's selector.
        l = ItemLoader(item=QuotesSpiderItem(), selector=quote)
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath(
            './/*[@itemprop="author"]/text()').extract_first()
        tags = quote.xpath(
            './/*[@itemprop="keywords"]/@content').extract_first()
        l.add_value('text', text)
        l.add_value('author', author)
        l.add_value('tags', tags)
        yield l.load_item()

    next_page_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    if next_page_url is not None:
        yield scrapy.Request(response.urljoin(next_page_url))
def parse(self, response):
    """Yield one loaded item per quote and follow pagination.

    Fixes: Python-2-only ``print`` statements replaced with the
    single-argument ``print(...)`` form (identical output on Python 2,
    valid on Python 3), and the next-page request is guarded so the
    last page does not crash the crawl.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        l = ItemLoader(item=QuotesSpiderItem(), response=response)
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath(
            './/*[@class="author"]/text()').extract_first()
        # Tag names as a list plus their hrefs, stored together.
        tags = quote.xpath('.//*[@class="tag"]/text()').extract()
        tag_links = quote.xpath('.//*[@class="tag"]/@href').extract()
        l.add_value('text', text)
        l.add_value('author', author)
        l.add_value('tags', {'tag_names': tags, 'tag_links': tag_links})
        yield l.load_item()

    next_pg_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    if next_pg_url is not None:
        abs_next_pg_url = response.urljoin(next_pg_url)
        print(abs_next_pg_url)
        yield scrapy.http.Request(abs_next_pg_url)
def parse(self, response):
    """Yield one loaded item per quote, then request the next page.

    Fixes: leftover ``print(author)`` / ``print(tags)`` debug noise is
    removed, and the pagination request is guarded — on the last page
    the "next" link is absent and the unconditional ``Request`` raised.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath(
            './/*[@itemprop="author"]/text()').extract_first()
        # Keywords arrive as one comma-separated string.
        tags = quote.xpath(
            './/*[@itemprop="keywords"]/@content').extract_first()
        l = ItemLoader(item=QuotesSpiderItem(), response=response)
        l.add_value('text', text)
        l.add_value('author', author)
        l.add_value('tags', tags)
        yield l.load_item()

    next_page_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    if next_page_url is not None:
        yield Request(response.urljoin(next_page_url))
def parse(self, response):
    """Yield each quote as both a plain dict and a loaded item.

    Fixes: the loader was built once outside the loop (merging every
    quote into one item) and ``l.load_item()``'s result was discarded,
    so no item ever reached the pipeline — the loader is now per quote
    and its item is yielded. The next-page request is guarded for the
    last page.
    """
    quotes = response.xpath('//*[@class="quote"]')
    for quote in quotes:
        text = quote.xpath('.//*[@class="text"]/text()').extract_first()
        author = quote.xpath(
            './/*[@itemprop="author"]/text()').extract_first()
        tags = quote.xpath('.//*[@itemprop="keywords"]/@content').extract()
        yield {'Text': text, 'Author': author, 'Tags': tags}
        # Store the scraped data via the item loader as well.
        # NOTE(review): field names 'Text'/'Author'/'Tags' must exist on
        # QuotesSpiderItem — sibling spiders use lowercase names; verify
        # against items.py.
        l = ItemLoader(item=QuotesSpiderItem(), selector=quote)
        l.add_value('Text', text)
        l.add_value('Author', author)
        l.add_value('Tags', tags)
        yield l.load_item()

    next_page_url = response.xpath(
        '//*[@class="next"]/a/@href').extract_first()
    if next_page_url is not None:
        yield Request(response.urljoin(next_page_url))