def parse(self, response):
    """Collect all sitemap URLs and schedule each of them for parsing.

    The main ``sitemap.xml`` on this site is not updated frequently enough,
    so sub-sitemaps newer than the ones it lists are discovered by probing
    ``https://iz.ru/export/sitemap/<n>/xml`` with an increasing ``n`` until
    a sitemap without any ``<loc>`` entry is returned.

    No date filtering happens here: every collected sitemap is handed to
    ``self.parse_sitemap``.

    Args:
        response: Response whose body is the main sitemap XML.

    Yields:
        One ``Request`` per sitemap URL, in reverse order of collection
        (probed sitemaps first), with ``self.parse_sitemap`` as callback.
    """
    # Local import: the file's top-level import block is not visible here.
    import logging

    links = Selector(text=response.body).xpath("//loc/text()").getall()
    if not links:
        # An empty main sitemap gives no number to start probing from.
        return

    # Number of the last listed sitemap, e.g. "1" in
    # https://iz.ru/export/sitemap/1/xml; probing starts right after it.
    sitemap_n = int(links[-1].split("sitemap/")[1].split("/")[0]) + 1

    # NOTE(review): requests.get is a blocking call; presumably this method
    # runs as a Scrapy callback, where it stalls other downloads - confirm.
    while True:
        link = "https://iz.ru/export/sitemap/{}/xml".format(sitemap_n)
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            body = requests.get(link, timeout=30).content
        except requests.RequestException as err:
            # Best effort: keep the sitemaps found so far instead of
            # failing the whole callback on a network error.
            logging.getLogger(__name__).warning(
                "Stopped probing for newer sitemaps at %s: %s", link, err
            )
            break

        # A sitemap without links marks the end of the published range.
        if not Selector(text=body).xpath("//loc/text()").getall():
            break
        links.append(link)
        sitemap_n += 1

    for link in reversed(links):
        yield Request(url=link, callback=self.parse_sitemap)
def parse_listing_contents(self, response):
    """Build a ``TvshowsItem`` from a show detail page.

    All values are located with absolute, position-based XPath expressions,
    so they depend on the exact page layout. A scalar field whose node is
    not found is stored as ``None``; a missing genre or cast panel yields
    an empty list.

    Args:
        response: Response for the show page; only ``response.xpath`` is
            used.

    Yields:
        A single ``TvshowsItem`` with the keys ``show_name``, ``status``,
        ``network``, ``language``, ``tv_db_score``, ``genre`` and ``casts``.
    """

    def first_text(xpath, strip=False):
        # First text node matched by ``xpath`` (optionally stripped), or
        # None when nothing matches.
        matches = response.xpath(xpath).extract()
        if not matches:
            return None
        return matches[0].strip() if strip else matches[0]

    def indexed_texts(panel_xpath, entry_xpath_template):
        # Collect link texts from the list entries of the first node
        # matched by ``panel_xpath``, walking entry positions 1, 2, 3, ...
        # until a position matches nothing.
        panels = response.xpath(panel_xpath).extract()
        if not panels:
            return []
        # Parse the panel HTML once instead of on every loop iteration.
        panel = Selector(text=panels[0])
        texts = []
        position = 1
        while True:
            matches = panel.xpath(
                entry_xpath_template.format(position)
            ).extract()
            if not matches:
                break
            if not texts:
                # The first position keeps every match; later positions
                # contribute only their first match.
                texts = matches
            else:
                texts.append(matches[0])
            position += 1
        return texts

    item = TvshowsItem()
    item["show_name"] = first_text(
        '//*[@id="main"]/section/div[1]/div/section[1]/section/div[1]/h2/a/text()'
    )
    item["status"] = first_text(
        '//*[@id="media_v4"]/section/div[1]/div/section[1]/p[1]/text()',
        strip=True,
    )
    item["network"] = first_text(
        '//*[@id="media_v4"]/section/div[1]/div/section[1]/p[2]/a/text()'
    )
    item["language"] = first_text(
        '//*[@id="media_v4"]/section/div[1]/div/section[1]/p[4]/text()',
        strip=True,
    )
    item["tv_db_score"] = first_text(
        '//*[@id="main"]/section/div[1]/div/section[1]/section/div[1]/div/div/span[2]/text()',
        strip=True,
    )
    item["genre"] = indexed_texts(
        '//*[@id="media_v4"]/section/div[1]/div/section[2]',
        '//ul/li[{}]/a/text()',
    )
    item["casts"] = indexed_texts(
        '//*[@id="main"]/section/div[1]/div/section[2]/ol',
        '//li[{}]/p[1]/a/text()',
    )
    yield item