def parseAnimals(self, response):
    html = HtmlParser(response)
    # Queue a detail request for every animal on the current result page.
    for url in html.extract_urls('//div[@id="search_results"]/div/a'):
        yield Request(url, callback="parseAnimal")
    # Follow the "next" pagination link when one exists; extract_urls
    # returns an empty list on the last page, so the loop is a no-op there.
    # (The original duplicated the result-list loop in both branches of an
    # if/else on the next link; the two branches collapse to this.)
    for url in html.extract_urls('//div[@class="pagination"]/a[@class="next"]'):
        yield Request(url, callback="parseAnimals")
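Every callback in this section relies on an HtmlParser helper with xpath() and extract_urls() methods, neither of which is defined here. A minimal sketch of what it might look like, built on lxml (the class and method names come from the calls above, but the bodies and the response attributes .url and .text are assumptions):

import urllib.parse
from lxml import etree

class HtmlParser:
    """Sketch only: wraps a fetched response in an lxml element tree.
    Assumes `response` exposes `.url` and the raw page in `.text`."""

    def __init__(self, response):
        self.url = response.url
        self.tree = etree.HTML(response.text)

    def xpath(self, expr):
        # Returns lxml elements, or plain strings for text() expressions,
        # matching how the callbacks iterate the results.
        return self.tree.xpath(expr)

    def extract_urls(self, expr):
        # Resolve each matched link's href against the page URL.
        return [urllib.parse.urljoin(self.url, a.get('href'))
                for a in self.tree.xpath(expr)]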
def parseCat(self, response):
    parser = HtmlParser(response)
    # Store every quote on the current page.
    for i in parser.xpath('//span[@class="bqQuoteLink"]/a//text()'):
        self.mydb.quotes.insert({'quote': i})
    # Follow the "Next" link when one exists; on the last page the loop
    # body never runs. (The original guarded this with an if/else whose
    # branches both inserted the quotes; the branches collapse to this.)
    for url in parser.extract_urls('//li/a[contains(text(),"Next")]'):
        yield Request(url, callback="parseCat")
def parseCat(self, response):
    parser = HtmlParser(response)
    dbname = response.meta['u']
    # Each quoteText div holds the quote as its leading text and the
    # author inside a child <a> element.
    for i in parser.xpath('//div[@class="quoteText"]'):
        quote = i.text
        author = None  # guard against a div with no <a> child
        for j in i.iterfind('a'):
            author = j.text
        self.mydb[dbname].insert({'quote': quote, 'author': author})
    # Follow pagination, carrying the collection name along in meta.
    for url in parser.extract_urls('//a[@class="next_page"]'):
        yield Request(url, callback="parseCat", meta={'u': dbname})
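The self.mydb handle used by both parseCat variants is never set up in this section. If it is a pymongo database, note that Collection.insert() is the legacy pymongo 2.x call; under pymongo 3+ the equivalent is insert_one(). A minimal sketch of the assumed setup (connection details and database name are hypothetical):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)  # assumed connection details
mydb = client['quotes_db']                # hypothetical database name

# Legacy pymongo 2.x style, as used in the callbacks above:
#     mydb.quotes.insert({'quote': text})
# pymongo 3+ equivalent:
mydb.quotes.insert_one({'quote': 'example quote text'})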
def parse(self, response):
    parser = HtmlParser(response)
    for i in parser.extract_urls('//div[@class="bqLn"]/div[@class="bqLn"]/a'):
        yield Request(i, callback="parseCat")
def parse(self, response):
    parser = HtmlParser(response)
    for url in parser.extract_urls('//a[@class="actionLinkLite serif"]'):
        dbname = url.split('/')[-1]
        yield Request(url, callback="parseCat", meta={'u': dbname})
def parse(self, response):
    html = HtmlParser(response)
    photo_list = ['//li[@class="first"]', '//li[@class=" "]', '//li[@class="last"]']
    for item in photo_list:
        for url in html.extract_urls(item):
            yield Request(url, callback="parseAnimals")
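Callbacks are passed as strings rather than bound methods, which implies the crawler resolves them by name when a response comes back. A minimal sketch of that dispatch loop, assuming a Request type with url, callback, and meta fields and a fetch function that attaches request.meta to the response (all of which are assumptions, not shown in the source):

from collections import deque
from dataclasses import dataclass, field

@dataclass
class Request:
    # Hypothetical container matching how Request is called above.
    url: str
    callback: str
    meta: dict = field(default_factory=dict)

def crawl(spider, start_requests, fetch):
    """Resolve each string callback with getattr and breadth-first
    queue whatever Requests the callback yields."""
    queue = deque(start_requests)
    while queue:
        request = queue.popleft()
        response = fetch(request)  # assumed to expose .meta for the callbacks
        for next_request in getattr(spider, request.callback)(response):
            queue.append(next_request)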