예제 #1
0
    def parse(self, response):
        """Read the main sitemap index and schedule every sub-sitemap.

        The links listed in the index are collected first.  Because the
        index on this site is not refreshed often enough, the method then
        probes the sitemap numbers that follow the last listed one, adding
        each URL that still contains ``<loc>`` entries, and stops at the
        first one that does not (or that cannot be fetched).

        Yields:
            One ``Request`` per collected sitemap URL, walking the list
            from its last entry to its first, with ``self.parse_sitemap``
            as the callback.
        """
        links = Selector(text=response.body).xpath("//loc/text()").getall()
        if not links:
            # An empty index gives no "last sitemap number" to continue
            # from; indexing links[-1] below would raise IndexError.
            return

        # Number of the last listed sitemap
        # (in this case: "1"): https://iz.ru/export/sitemap/1/xml
        sitemap_n = int(links[-1].split("sitemap/")[1].split("/")[0])

        # Probe the numbers after the last listed one until an empty
        # sitemap is reached.
        sitemap_n += 1
        while True:
            link = "https://iz.ru/export/sitemap/{}/xml".format(sitemap_n)
            try:
                # Without a timeout a stalled connection would block this
                # callback indefinitely.
                body = requests.get(link, timeout=30).content
            except requests.RequestException:
                # Probing is best-effort: keep the sitemaps found so far
                # instead of losing them to a network error.
                break

            sitemap_links = Selector(text=body).xpath("//loc/text()").getall()
            if not sitemap_links:
                break
            links.append(link)
            sitemap_n += 1

        # Schedule the sitemaps last-to-first; any date cut-off is left to
        # the parse_sitemap callback.
        for link in reversed(links):
            yield Request(url=link, callback=self.parse_sitemap)
예제 #2
0
    def parse_listing_contents(self, response):
        """Build one ``TvshowsItem`` from a show page and yield it.

        The item receives the keys ``show_name``, ``status``, ``network``,
        ``language``, ``tv_db_score``, ``genre`` and ``casts``.  The first
        five are single text nodes; ``genre`` and ``casts`` are lists
        gathered from the genre and cast panels.  A page without one of
        those panels gets an empty list for that key.

        Raises:
            IndexError: if the XPath of one of the five single-value
                fields matches nothing on the page.
        """
        def first_text(xpath):
            # Index 0 is intentional: a missing mandatory node fails
            # loudly instead of producing a half-filled item.
            return response.xpath(xpath).extract()[0]

        def texts_by_position(panel_xpath, entry_template):
            """Collect entry texts from a panel, position by position.

            ``entry_template`` is an XPath with one ``{}`` placeholder for
            the 1-based ``li`` position.  Collection stops at the first
            position that yields no text.
            """
            panels = response.xpath(panel_xpath).extract()
            if not panels:
                return []
            # Parse the panel markup once; it does not change between
            # iterations.
            panel = Selector(text=panels[0])
            collected = []
            position = 1
            while True:
                matches = panel.xpath(entry_template.format(position)).extract()
                if not matches:
                    break
                if collected:
                    collected.append(matches[0])
                else:
                    # The first position contributes every match, later
                    # positions only their first one; kept as-is so the
                    # scraped values stay unchanged.
                    collected = matches
                position += 1
            return collected

        item = TvshowsItem()
        item["show_name"] = first_text(
            '//*[@id="main"]/section/div[1]/div/section[1]/section/div[1]/h2/a/text()')
        item["status"] = first_text(
            '//*[@id="media_v4"]/section/div[1]/div/section[1]/p[1]/text()').strip()
        item["network"] = first_text(
            '//*[@id="media_v4"]/section/div[1]/div/section[1]/p[2]/a/text()')
        item["language"] = first_text(
            '//*[@id="media_v4"]/section/div[1]/div/section[1]/p[4]/text()').strip()
        item["tv_db_score"] = first_text(
            '//*[@id="main"]/section/div[1]/div/section[1]/section/div[1]/div/div/span[2]/text()').strip()

        item["genre"] = texts_by_position(
            '//*[@id="media_v4"]/section/div[1]/div/section[2]',
            '//ul/li[{}]/a/text()')
        item["casts"] = texts_by_position(
            '//*[@id="main"]/section/div[1]/div/section[2]/ol',
            '//li[{}]/p[1]/a/text()')

        yield item